/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING. If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */
#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

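/* Work item carrying the old/new state pair from __drbd_set_state() over to
 * the worker thread, which then runs after_state_ch() outside the req_lock. */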
struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;
struct list_head drbd_tconns;  /* list of struct drbd_tconn */

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* peer requests */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a single linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif
183
184/**
185 * DOC: The transfer log
186 *
187 * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
Philipp Reisner87eeee42011-01-19 14:16:30 +0100188 * mdev->tconn->newest_tle points to the head, mdev->tconn->oldest_tle points to the tail
Philipp Reisnerb411b362009-09-25 16:07:19 -0700189 * of the list. There is always at least one &struct drbd_tl_epoch object.
190 *
191 * Each &struct drbd_tl_epoch has a circular double linked list of requests
192 * attached.
193 */
194static int tl_init(struct drbd_conf *mdev)
195{
196 struct drbd_tl_epoch *b;
197
198 /* during device minor initialization, we may well use GFP_KERNEL */
199 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
200 if (!b)
201 return 0;
202 INIT_LIST_HEAD(&b->requests);
203 INIT_LIST_HEAD(&b->w.list);
204 b->next = NULL;
205 b->br_number = 4711;
Philipp Reisner7e602c02010-05-27 14:49:27 +0200206 b->n_writes = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700207 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
208
Philipp Reisner87eeee42011-01-19 14:16:30 +0100209 mdev->tconn->oldest_tle = b;
210 mdev->tconn->newest_tle = b;
211 INIT_LIST_HEAD(&mdev->tconn->out_of_sequence_requests);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700212
Philipp Reisnerb411b362009-09-25 16:07:19 -0700213 return 1;
214}
215
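/**
 * tl_cleanup() - Frees the transfer log objects allocated by tl_init()
 * @mdev:	DRBD device.
 *
 * Expects that only the initial barrier object is left and that the
 * out_of_sequence_requests list is empty.
 */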
static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->tconn->oldest_tle == mdev->tconn->newest_tle);
	D_ASSERT(list_empty(&mdev->tconn->out_of_sequence_requests));
	kfree(mdev->tconn->oldest_tle);
	mdev->tconn->oldest_tle = NULL;
	kfree(mdev->tconn->unused_spare_tle);
	mdev->tconn->unused_spare_tle = NULL;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->tconn->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->tconn->newest_tle != new) {
		mdev->tconn->newest_tle->next = new;
		mdev->tconn->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->tconn->req_lock);

	b = mdev->tconn->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, BARRIER_ACKED);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the lists head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(CONNECTION_LOST_WHILE_PENDING).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, BARRIER_ACKED) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->tconn->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->tconn->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->tconn->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->tconn->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->tconn->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
333
Philipp Reisner617049a2010-12-22 12:48:31 +0100334
Philipp Reisner11b58e72010-05-12 17:08:26 +0200335/**
336 * _tl_restart() - Walks the transfer log, and applies an action to all requests
337 * @mdev: DRBD device.
338 * @what: The action/event to perform with all request objects
339 *
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100340 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
341 * RESTART_FROZEN_DISK_IO.
Philipp Reisner11b58e72010-05-12 17:08:26 +0200342 */
343static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
344{
345 struct drbd_tl_epoch *b, *tmp, **pn;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200346 struct list_head *le, *tle, carry_reads;
Philipp Reisner11b58e72010-05-12 17:08:26 +0200347 struct drbd_request *req;
348 int rv, n_writes, n_reads;
349
Philipp Reisner87eeee42011-01-19 14:16:30 +0100350 b = mdev->tconn->oldest_tle;
351 pn = &mdev->tconn->oldest_tle;
Philipp Reisner11b58e72010-05-12 17:08:26 +0200352 while (b) {
353 n_writes = 0;
354 n_reads = 0;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200355 INIT_LIST_HEAD(&carry_reads);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200356 list_for_each_safe(le, tle, &b->requests) {
357 req = list_entry(le, struct drbd_request, tl_requests);
358 rv = _req_mod(req, what);
359
360 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
361 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
362 }
363 tmp = b->next;
364
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200365 if (n_writes) {
Andreas Gruenbacher8554df12011-01-25 15:37:43 +0100366 if (what == RESEND) {
Philipp Reisner11b58e72010-05-12 17:08:26 +0200367 b->n_writes = n_writes;
368 if (b->w.cb == NULL) {
369 b->w.cb = w_send_barrier;
370 inc_ap_pending(mdev);
371 set_bit(CREATE_BARRIER, &mdev->flags);
372 }
373
Philipp Reisnere42325a2011-01-19 13:55:45 +0100374 drbd_queue_work(&mdev->tconn->data.work, &b->w);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200375 }
376 pn = &b->next;
377 } else {
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200378 if (n_reads)
379 list_add(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200380 /* there could still be requests on that ring list,
381 * in case local io is still pending */
382 list_del(&b->requests);
383
384 /* dec_ap_pending corresponding to queue_barrier.
385 * the newest barrier may not have been queued yet,
386 * in which case w.cb is still NULL. */
387 if (b->w.cb != NULL)
388 dec_ap_pending(mdev);
389
Philipp Reisner87eeee42011-01-19 14:16:30 +0100390 if (b == mdev->tconn->newest_tle) {
Philipp Reisner11b58e72010-05-12 17:08:26 +0200391 /* recycle, but reinit! */
392 D_ASSERT(tmp == NULL);
393 INIT_LIST_HEAD(&b->requests);
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200394 list_splice(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200395 INIT_LIST_HEAD(&b->w.list);
396 b->w.cb = NULL;
397 b->br_number = net_random();
398 b->n_writes = 0;
399
400 *pn = b;
401 break;
402 }
403 *pn = tmp;
404 kfree(b);
405 }
406 b = tmp;
Philipp Reisnerb9b98712010-06-22 11:26:48 +0200407 list_splice(&carry_reads, &b->requests);
Philipp Reisner11b58e72010-05-12 17:08:26 +0200408 }
409}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->tconn->req_lock);

	_tl_restart(mdev, CONNECTION_LOST_WHILE_PENDING);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->tconn->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->tconn->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, CONNECTION_LOST_WHILE_PENDING);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	spin_unlock_irq(&mdev->tconn->req_lock);
}

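/* Like _tl_restart(), but grabs and releases the req_lock itself. */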
void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->tconn->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->tconn->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}

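/**
 * drbd_change_state() - Change the DRBD state under the req_lock
 * @mdev:	DRBD device.
 * @f:		state change flags, see &enum chg_state_flags.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 *
 * Computes the new state from @mask/@val and applies it via _drbd_set_state()
 * while holding the req_lock.
 */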
enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

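/* Evaluated as the wait_event() condition in drbd_req_state(): checks whether
 * the peer already acknowledged or rejected the cluster wide state change,
 * or whether it can (or must) be decided locally without further waiting. */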
static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	return rv;
}

/**
 * drbd_req_state() - Perform an eventually cluster wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			   (rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->tconn->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->tconn->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
		  union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev->tconn)) {
		if (!mdev->tconn->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev->tconn);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->tconn->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
		rv = SS_CONNECTED_OUTDATES;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort:
 *
 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow Network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn_sync_abort)
			*warn_sync_abort =
				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				"Online-verify" : "Resync";
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection state on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no accessible data is available */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->tconn->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	const char *warn_sync_abort = NULL;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	if (warn_sync_abort)
		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);

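	/* Log a human readable summary of exactly which state fields changed. */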
	{
	char *pbp, pb[300];
	pbp = pb;
	*pbp = 0;
	if (ns.role != os.role)
		pbp += sprintf(pbp, "role( %s -> %s ) ",
			       drbd_role_str(os.role),
			       drbd_role_str(ns.role));
	if (ns.peer != os.peer)
		pbp += sprintf(pbp, "peer( %s -> %s ) ",
			       drbd_role_str(os.peer),
			       drbd_role_str(ns.peer));
	if (ns.conn != os.conn)
		pbp += sprintf(pbp, "conn( %s -> %s ) ",
			       drbd_conn_str(os.conn),
			       drbd_conn_str(ns.conn));
	if (ns.disk != os.disk)
		pbp += sprintf(pbp, "disk( %s -> %s ) ",
			       drbd_disk_str(os.disk),
			       drbd_disk_str(ns.disk));
	if (ns.pdsk != os.pdsk)
		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
			       drbd_disk_str(os.pdsk),
			       drbd_disk_str(ns.pdsk));
	if (is_susp(ns) != is_susp(os))
		pbp += sprintf(pbp, "susp( %d -> %d ) ",
			       is_susp(os),
			       is_susp(ns));
	if (ns.aftr_isp != os.aftr_isp)
		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
			       os.aftr_isp,
			       ns.aftr_isp);
	if (ns.peer_isp != os.peer_isp)
		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
			       os.peer_isp,
			       ns.peer_isp);
	if (ns.user_isp != os.user_isp)
		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
			       os.user_isp,
			       ns.user_isp);
	dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->tconn->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->tconn->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->tconn->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->tconn->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

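/* Worker callback: run the (possibly sleeping) after state change actions,
 * then complete the optional completion handed in via __drbd_set_state(). */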
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

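/* Presumably called with the result of the bitmap write done before a resync:
 * on failure fall back to C_CONNECTED, otherwise move on from
 * C_STARTING_SYNC_T/S to the next step of the resync handshake. */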
static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

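/**
 * drbd_bitmap_io_from_worker() - Run a bitmap I/O function from the worker thread
 * @mdev:	DRBD device.
 * @io_fn:	bitmap I/O function to call while holding the bitmap lock.
 * @why:	textual description, passed on to drbd_bm_lock().
 * @flags:	bitmap lock flags.
 *
 * Must only be called from the worker; application I/O is suspended
 * (non-blocking) while @io_fn runs.
 */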
int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
		int (*io_fn)(struct drbd_conf *),
		char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current == mdev->tconn->worker.task);

	/* open coded non-blocking drbd_suspend_io(mdev); */
	set_bit(SUSPEND_IO, &mdev->flags);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = NOTHING;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
			what = RESEND;

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = RESTART_FROZEN_DISK_IO;

		if (what != NOTHING)
			nsm.susp_nod = 0;
	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->tconn->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->tconn->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = RESEND;
			nsm.susp_fen = 0;
		}
	}

	if (what != NOTHING) {
		spin_lock_irq(&mdev->tconn->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->tconn->req_lock);
	}

	/* Became sync source.  With protocol >= 96, we still need to send out
	 * the sync uuid now. Need to do that before any drbd_send_state, or
	 * the other side may go "paused sync" before receiving the sync uuids,
	 * which is unexpected. */
	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
	    mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) {
		drbd_gen_and_send_sync_uuid(mdev);
		put_ldev(mdev);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	/* No point in queuing send_bitmap if we don't have a connection
	 * anymore, so check also the _current_ state, not only the new state
	 * at the time this work was queued. */
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
	    mdev->state.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
				"send_bitmap (WFBitMapS)",
				BM_LOCKED_TEST_ALLOWED);

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					set_bit(NEW_CUR_UUID, &mdev->flags);
				} else {
					drbd_uuid_new_current(mdev);
					drbd_send_uuids(mdev);
				}
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001437 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001438 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001439 drbd_send_uuids(mdev);
1440 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001441
1442 /* D_DISKLESS Peer becomes secondary */
1443 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001444 /* We may still be Primary ourselves.
1445 * No harm done if the bitmap still changes,
1446 * redirtied pages will follow later. */
1447 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1448 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001449 put_ldev(mdev);
1450 }
1451
Lars Ellenberg06d33e92010-12-18 17:00:59 +01001452 /* Write out all changed bits on demote.
1453	 * Though, no need to do that just yet
1454 * if there is a resync going on still */
1455 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1456 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001457 /* No changes to the bitmap expected this time, so assert that,
1458 * even though no harm was done if it did change. */
1459 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1460 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001461 put_ldev(mdev);
1462 }
1463
1464 /* Last part of the attaching process ... */
1465 if (ns.conn >= C_CONNECTED &&
1466 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001467 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001468 drbd_send_uuids(mdev);
1469 drbd_send_state(mdev);
1470 }
1471
1472 /* We want to pause/continue resync, tell peer. */
1473 if (ns.conn >= C_CONNECTED &&
1474 ((os.aftr_isp != ns.aftr_isp) ||
1475 (os.user_isp != ns.user_isp)))
1476 drbd_send_state(mdev);
1477
1478 /* In case one of the isp bits got set, suspend other devices. */
1479 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1480 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1481 suspend_other_sg(mdev);
1482
1483	/* Make sure the peer gets informed about possible state
1484 changes (ISP bits) while we were in WFReportParams. */
1485 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1486 drbd_send_state(mdev);
1487
Philipp Reisner67531712010-10-27 12:21:30 +02001488 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1489 drbd_send_state(mdev);
1490
Philipp Reisnerb411b362009-09-25 16:07:19 -07001491 /* We are in the progress to start a full sync... */
1492 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1493 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001494 /* no other bitmap changes expected during this phase */
1495 drbd_queue_bitmap_io(mdev,
1496 &drbd_bmio_set_n_write, &abw_start_sync,
1497 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001498
1499	/* We are invalidating ourselves... */
1500 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1501 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001502 /* other bitmap operation expected during this phase */
1503 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1504 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001505
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001506 /* first half of local IO error, failure to attach,
1507 * or administrative detach */
1508 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1509 enum drbd_io_error_p eh;
1510 int was_io_error;
1511 /* corresponding get_ldev was in __drbd_set_state, to serialize
1512 * our cleanup here with the transition to D_DISKLESS,
1513	 * so it is safe to dereference ldev here. */
1514 eh = mdev->ldev->dc.on_io_error;
1515 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1516
1517 /* current state still has to be D_FAILED,
1518 * there is only one way out: to D_DISKLESS,
1519 * and that may only happen after our put_ldev below. */
1520 if (mdev->state.disk != D_FAILED)
1521 dev_err(DEV,
1522 "ASSERT FAILED: disk is %s during detach\n",
1523 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001524
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001525 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001526 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001527 else
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001528 dev_err(DEV, "Sending state for detaching disk failed\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001529
1530 drbd_rs_cancel_all(mdev);
1531
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001532 /* In case we want to get something to stable storage still,
1533 * this may be the last chance.
1534 * Following put_ldev may transition to D_DISKLESS. */
1535 drbd_md_sync(mdev);
1536 put_ldev(mdev);
1537
1538 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001539 drbd_khelper(mdev, "local-io-error");
1540 }
1541
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001542 /* second half of local IO error, failure to attach,
1543 * or administrative detach,
1544 * after local_cnt references have reached zero again */
1545 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1546 /* We must still be diskless,
1547 * re-attach has to be serialized with this! */
1548 if (mdev->state.disk != D_DISKLESS)
1549 dev_err(DEV,
1550 "ASSERT FAILED: disk is %s while going diskless\n",
1551 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001552
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001553 mdev->rs_total = 0;
1554 mdev->rs_failed = 0;
1555 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001556
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001557 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001558 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001559 /* corresponding get_ldev in __drbd_set_state
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001560 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001561 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001562 }
1563
Philipp Reisner738a84b2011-03-03 00:21:30 +01001564	/* Notify peer that I had a local IO error, and did not detach. */
1565 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1566 drbd_send_state(mdev);
1567
Philipp Reisnerb411b362009-09-25 16:07:19 -07001568 /* Disks got bigger while they were detached */
1569 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1570 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1571 if (ns.conn == C_CONNECTED)
1572 resync_after_online_grow(mdev);
1573 }
1574
1575 /* A resync finished or aborted, wake paused devices... */
1576 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1577 (os.peer_isp && !ns.peer_isp) ||
1578 (os.user_isp && !ns.user_isp))
1579 resume_next_sg(mdev);
1580
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001581 /* sync target done with resync. Explicitly notify peer, even though
1582 * it should (at least for non-empty resyncs) already know itself. */
1583 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1584 drbd_send_state(mdev);
1585
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001586 /* This triggers bitmap writeout of potentially still unwritten pages
1587 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001588 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001589 * For resync aborted because of local disk failure, we cannot do
1590 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001591 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001592 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001593 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1594 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1595 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001596 put_ldev(mdev);
1597 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001598
Philipp Reisnerb411b362009-09-25 16:07:19 -07001599 /* Upon network connection, we need to start the receiver */
1600 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01001601 drbd_thread_start(&mdev->tconn->receiver);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001602
1603 /* Terminate worker thread if we are unconfigured - it will be
1604 restarted as needed... */
1605 if (ns.disk == D_DISKLESS &&
1606 ns.conn == C_STANDALONE &&
1607 ns.role == R_SECONDARY) {
1608 if (os.aftr_isp != ns.aftr_isp)
1609 resume_next_sg(mdev);
1610 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1611 if (test_bit(DEVICE_DYING, &mdev->flags))
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01001612 drbd_thread_stop_nowait(&mdev->tconn->worker);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001613 }
1614
1615 drbd_md_sync(mdev);
1616}
1617
1618
1619static int drbd_thread_setup(void *arg)
1620{
1621 struct drbd_thread *thi = (struct drbd_thread *) arg;
1622 struct drbd_conf *mdev = thi->mdev;
1623 unsigned long flags;
1624 int retval;
1625
1626restart:
1627 retval = thi->function(thi);
1628
1629 spin_lock_irqsave(&thi->t_lock, flags);
1630
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001631 /* if the receiver has been "EXITING", the last thing it did
Philipp Reisnerb411b362009-09-25 16:07:19 -07001632 * was set the conn state to "StandAlone",
1633 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1634 * and receiver thread will be "started".
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001635 * drbd_thread_start needs to set "RESTARTING" in that case.
Philipp Reisnerb411b362009-09-25 16:07:19 -07001636 * t_state check and assignment needs to be within the same spinlock,
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001637 * so either thread_start sees EXITING, and can remap to RESTARTING,
1638	 * or thread_start sees NONE, and can proceed as normal.
Philipp Reisnerb411b362009-09-25 16:07:19 -07001639 */
1640
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001641 if (thi->t_state == RESTARTING) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001642 dev_info(DEV, "Restarting %s\n", current->comm);
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001643 thi->t_state = RUNNING;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001644 spin_unlock_irqrestore(&thi->t_lock, flags);
1645 goto restart;
1646 }
1647
1648 thi->task = NULL;
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001649 thi->t_state = NONE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001650 smp_mb();
1651 complete(&thi->stop);
1652 spin_unlock_irqrestore(&thi->t_lock, flags);
1653
1654 dev_info(DEV, "Terminating %s\n", current->comm);
1655
1656 /* Release mod reference taken when thread was started */
1657 module_put(THIS_MODULE);
1658 return retval;
1659}
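
/*
 * Added note, not part of the original source: rough sketch of the
 * drbd_thread state machine implemented by drbd_thread_setup() above and
 * drbd_thread_start()/_drbd_thread_stop() below.
 *
 *   NONE       --drbd_thread_start()------------------> RUNNING
 *   RUNNING    --_drbd_thread_stop()------------------> EXITING (or RESTARTING)
 *   EXITING    --drbd_thread_start() while exiting----> RESTARTING
 *   RESTARTING --thread function returned-------------> RUNNING (loops again)
 *   EXITING    --thread function returned-------------> NONE (completes thi->stop)
 *
 * All transitions are made under thi->t_lock.
 */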
1660
1661static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1662 int (*func) (struct drbd_thread *))
1663{
1664 spin_lock_init(&thi->t_lock);
1665 thi->task = NULL;
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001666 thi->t_state = NONE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001667 thi->function = func;
1668 thi->mdev = mdev;
1669}
1670
1671int drbd_thread_start(struct drbd_thread *thi)
1672{
1673 struct drbd_conf *mdev = thi->mdev;
1674 struct task_struct *nt;
1675 unsigned long flags;
1676
1677 const char *me =
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01001678 thi == &mdev->tconn->receiver ? "receiver" :
1679 thi == &mdev->tconn->asender ? "asender" :
1680 thi == &mdev->tconn->worker ? "worker" : "NONSENSE";
Philipp Reisnerb411b362009-09-25 16:07:19 -07001681
1682 /* is used from state engine doing drbd_thread_stop_nowait,
1683 * while holding the req lock irqsave */
1684 spin_lock_irqsave(&thi->t_lock, flags);
1685
1686 switch (thi->t_state) {
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001687 case NONE:
Philipp Reisnerb411b362009-09-25 16:07:19 -07001688 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1689 me, current->comm, current->pid);
1690
1691 /* Get ref on module for thread - this is released when thread exits */
1692 if (!try_module_get(THIS_MODULE)) {
1693 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1694 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001695 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001696 }
1697
1698 init_completion(&thi->stop);
1699 D_ASSERT(thi->task == NULL);
1700 thi->reset_cpu_mask = 1;
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001701 thi->t_state = RUNNING;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001702 spin_unlock_irqrestore(&thi->t_lock, flags);
1703 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1704
1705 nt = kthread_create(drbd_thread_setup, (void *) thi,
1706 "drbd%d_%s", mdev_to_minor(mdev), me);
1707
1708 if (IS_ERR(nt)) {
1709 dev_err(DEV, "Couldn't start thread\n");
1710
1711 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001712 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001713 }
1714 spin_lock_irqsave(&thi->t_lock, flags);
1715 thi->task = nt;
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001716 thi->t_state = RUNNING;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001717 spin_unlock_irqrestore(&thi->t_lock, flags);
1718 wake_up_process(nt);
1719 break;
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001720 case EXITING:
1721 thi->t_state = RESTARTING;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001722 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1723 me, current->comm, current->pid);
1724 /* fall through */
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001725 case RUNNING:
1726 case RESTARTING:
Philipp Reisnerb411b362009-09-25 16:07:19 -07001727 default:
1728 spin_unlock_irqrestore(&thi->t_lock, flags);
1729 break;
1730 }
1731
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001732 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001733}
1734
1735
1736void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1737{
1738 unsigned long flags;
1739
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001740 enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001741
1742 /* may be called from state engine, holding the req lock irqsave */
1743 spin_lock_irqsave(&thi->t_lock, flags);
1744
Andreas Gruenbachere77a0a52011-01-25 15:43:39 +01001745 if (thi->t_state == NONE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001746 spin_unlock_irqrestore(&thi->t_lock, flags);
1747 if (restart)
1748 drbd_thread_start(thi);
1749 return;
1750 }
1751
1752 if (thi->t_state != ns) {
1753 if (thi->task == NULL) {
1754 spin_unlock_irqrestore(&thi->t_lock, flags);
1755 return;
1756 }
1757
1758 thi->t_state = ns;
1759 smp_mb();
1760 init_completion(&thi->stop);
1761 if (thi->task != current)
1762 force_sig(DRBD_SIGKILL, thi->task);
1763
1764 }
1765
1766 spin_unlock_irqrestore(&thi->t_lock, flags);
1767
1768 if (wait)
1769 wait_for_completion(&thi->stop);
1770}
1771
1772#ifdef CONFIG_SMP
1773/**
1774 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1775 * @mdev: DRBD device.
1776 *
1777 * Forces all threads of a device onto the same CPU. This is beneficial for
1778 * DRBD's performance. May be overridden by the user's configuration.
1779 */
1780void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1781{
1782 int ord, cpu;
1783
1784 /* user override. */
1785 if (cpumask_weight(mdev->cpu_mask))
1786 return;
1787
1788 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1789 for_each_online_cpu(cpu) {
1790 if (ord-- == 0) {
1791 cpumask_set_cpu(cpu, mdev->cpu_mask);
1792 return;
1793 }
1794 }
1795 /* should not be reached */
1796 cpumask_setall(mdev->cpu_mask);
1797}
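
/*
 * Illustration, added and not in the original source: assuming CPUs 0-3
 * are online and no user mask is configured, minors 0..7 get ord = minor % 4
 * and are therefore pinned to CPUs 0, 1, 2, 3, 0, 1, 2, 3 respectively.
 */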
1798
1799/**
1800 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1801 * @mdev: DRBD device.
1802 *
1803 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1804 * prematurely.
1805 */
1806void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1807{
1808 struct task_struct *p = current;
1809 struct drbd_thread *thi =
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01001810 p == mdev->tconn->asender.task ? &mdev->tconn->asender :
1811 p == mdev->tconn->receiver.task ? &mdev->tconn->receiver :
1812 p == mdev->tconn->worker.task ? &mdev->tconn->worker :
Philipp Reisnerb411b362009-09-25 16:07:19 -07001813 NULL;
Andreas Gruenbacher841ce242010-12-15 19:31:20 +01001814 if (!expect(thi != NULL))
Philipp Reisnerb411b362009-09-25 16:07:19 -07001815 return;
1816 if (!thi->reset_cpu_mask)
1817 return;
1818 thi->reset_cpu_mask = 0;
1819 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1820}
1821#endif
1822
Philipp Reisnerfd340c12011-01-19 16:57:39 +01001823static void prepare_header80(struct drbd_conf *mdev, struct p_header80 *h,
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01001824 enum drbd_packet cmd, int size)
Philipp Reisnerfd340c12011-01-19 16:57:39 +01001825{
1826 h->magic = cpu_to_be32(DRBD_MAGIC);
1827 h->command = cpu_to_be16(cmd);
1828 h->length = cpu_to_be16(size);
1829}
1830
1831static void prepare_header95(struct drbd_conf *mdev, struct p_header95 *h,
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01001832 enum drbd_packet cmd, int size)
Philipp Reisnerfd340c12011-01-19 16:57:39 +01001833{
1834 h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
1835 h->command = cpu_to_be16(cmd);
1836 h->length = cpu_to_be32(size);
1837}
1838
1839static void prepare_header(struct drbd_conf *mdev, struct p_header *h,
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01001840 enum drbd_packet cmd, int size)
Philipp Reisnerfd340c12011-01-19 16:57:39 +01001841{
1842 if (mdev->tconn->agreed_pro_version >= 100 || size > DRBD_MAX_SIZE_H80_PACKET)
1843 prepare_header95(mdev, &h->h95, cmd, size);
1844 else
1845 prepare_header80(mdev, &h->h80, cmd, size);
1846}
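
/*
 * Illustration, added and not in the original source: prepare_header()
 * selects the wire header by protocol version and payload size.  On
 * protocol 96, a payload that fits into DRBD_MAX_SIZE_H80_PACKET is sent
 * with the h80 header and its 16 bit length field; protocol >= 100, or a
 * larger payload, switches to the h95 header with a 32 bit length field.
 */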
1847
Philipp Reisnerb411b362009-09-25 16:07:19 -07001848/* the appropriate socket mutex must be held already */
1849int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01001850 enum drbd_packet cmd, struct p_header *h, size_t size,
1851 unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001852{
1853 int sent, ok;
1854
Andreas Gruenbacher841ce242010-12-15 19:31:20 +01001855 if (!expect(h))
1856 return false;
1857 if (!expect(size))
1858 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001859
Philipp Reisnerfd340c12011-01-19 16:57:39 +01001860 prepare_header(mdev, h, cmd, size - sizeof(struct p_header));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001861
Philipp Reisnerb411b362009-09-25 16:07:19 -07001862 sent = drbd_send(mdev, sock, h, size, msg_flags);
1863
1864 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001865 if (!ok && !signal_pending(current))
1866 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001867 cmdname(cmd), (int)size, sent);
1868 return ok;
1869}
1870
1871/* don't pass the socket. we may only look at it
1872 * when we hold the appropriate socket mutex.
1873 */
1874int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01001875 enum drbd_packet cmd, struct p_header *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001876{
1877 int ok = 0;
1878 struct socket *sock;
1879
1880 if (use_data_socket) {
Philipp Reisnere42325a2011-01-19 13:55:45 +01001881 mutex_lock(&mdev->tconn->data.mutex);
1882 sock = mdev->tconn->data.socket;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001883 } else {
Philipp Reisnere42325a2011-01-19 13:55:45 +01001884 mutex_lock(&mdev->tconn->meta.mutex);
1885 sock = mdev->tconn->meta.socket;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001886 }
1887
1888 /* drbd_disconnect() could have called drbd_free_sock()
1889 * while we were waiting in down()... */
1890 if (likely(sock != NULL))
1891 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1892
1893 if (use_data_socket)
Philipp Reisnere42325a2011-01-19 13:55:45 +01001894 mutex_unlock(&mdev->tconn->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001895 else
Philipp Reisnere42325a2011-01-19 13:55:45 +01001896 mutex_unlock(&mdev->tconn->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001897 return ok;
1898}
1899
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01001900int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packet cmd, char *data,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001901 size_t size)
1902{
Philipp Reisnerfd340c12011-01-19 16:57:39 +01001903 struct p_header h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001904 int ok;
1905
Philipp Reisnerfd340c12011-01-19 16:57:39 +01001906 prepare_header(mdev, &h, cmd, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001907
1908 if (!drbd_get_data_sock(mdev))
1909 return 0;
1910
Philipp Reisnerb411b362009-09-25 16:07:19 -07001911 ok = (sizeof(h) ==
Philipp Reisnere42325a2011-01-19 13:55:45 +01001912 drbd_send(mdev, mdev->tconn->data.socket, &h, sizeof(h), 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001913 ok = ok && (size ==
Philipp Reisnere42325a2011-01-19 13:55:45 +01001914 drbd_send(mdev, mdev->tconn->data.socket, data, size, 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001915
1916 drbd_put_data_sock(mdev);
1917
1918 return ok;
1919}
1920
1921int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1922{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001923 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001924 struct socket *sock;
1925 int size, rv;
Philipp Reisner31890f42011-01-19 14:12:51 +01001926 const int apv = mdev->tconn->agreed_pro_version;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001927
1928 size = apv <= 87 ? sizeof(struct p_rs_param)
1929 : apv == 88 ? sizeof(struct p_rs_param)
1930 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001931 : apv <= 94 ? sizeof(struct p_rs_param_89)
1932 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001933
1934 /* used from admin command context and receiver/worker context.
1935 * to avoid kmalloc, grab the socket right here,
1936 * then use the pre-allocated sbuf there */
Philipp Reisnere42325a2011-01-19 13:55:45 +01001937 mutex_lock(&mdev->tconn->data.mutex);
1938 sock = mdev->tconn->data.socket;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001939
1940 if (likely(sock != NULL)) {
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01001941 enum drbd_packet cmd =
1942 apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001943
Philipp Reisnere42325a2011-01-19 13:55:45 +01001944 p = &mdev->tconn->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001945
1946 /* initialize verify_alg and csums_alg */
1947 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1948
1949 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001950 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1951 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1952 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1953 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001954
1955 if (apv >= 88)
1956 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1957 if (apv >= 89)
1958 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1959
1960 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1961 } else
1962 rv = 0; /* not ok */
1963
Philipp Reisnere42325a2011-01-19 13:55:45 +01001964 mutex_unlock(&mdev->tconn->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001965
1966 return rv;
1967}
1968
1969int drbd_send_protocol(struct drbd_conf *mdev)
1970{
1971 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001972 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001973
1974 size = sizeof(struct p_protocol);
1975
Philipp Reisner31890f42011-01-19 14:12:51 +01001976 if (mdev->tconn->agreed_pro_version >= 87)
Philipp Reisner89e58e72011-01-19 13:12:45 +01001977 size += strlen(mdev->tconn->net_conf->integrity_alg) + 1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001978
1979 /* we must not recurse into our own queue,
1980 * as that is blocked during handshake */
1981 p = kmalloc(size, GFP_NOIO);
1982 if (p == NULL)
1983 return 0;
1984
Philipp Reisner89e58e72011-01-19 13:12:45 +01001985 p->protocol = cpu_to_be32(mdev->tconn->net_conf->wire_protocol);
1986 p->after_sb_0p = cpu_to_be32(mdev->tconn->net_conf->after_sb_0p);
1987 p->after_sb_1p = cpu_to_be32(mdev->tconn->net_conf->after_sb_1p);
1988 p->after_sb_2p = cpu_to_be32(mdev->tconn->net_conf->after_sb_2p);
1989 p->two_primaries = cpu_to_be32(mdev->tconn->net_conf->two_primaries);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001990
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001991 cf = 0;
Philipp Reisner89e58e72011-01-19 13:12:45 +01001992 if (mdev->tconn->net_conf->want_lose)
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001993 cf |= CF_WANT_LOSE;
Philipp Reisner89e58e72011-01-19 13:12:45 +01001994 if (mdev->tconn->net_conf->dry_run) {
Philipp Reisner31890f42011-01-19 14:12:51 +01001995 if (mdev->tconn->agreed_pro_version >= 92)
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001996 cf |= CF_DRY_RUN;
1997 else {
1998 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001999 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01002000 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002001 }
2002 }
2003 p->conn_flags = cpu_to_be32(cf);
2004
Philipp Reisner31890f42011-01-19 14:12:51 +01002005 if (mdev->tconn->agreed_pro_version >= 87)
Philipp Reisner89e58e72011-01-19 13:12:45 +01002006 strcpy(p->integrity_alg, mdev->tconn->net_conf->integrity_alg);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002007
Philipp Reisnerc0129492011-01-19 16:58:16 +01002008 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, &p->head, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002009 kfree(p);
2010 return rv;
2011}
2012
2013int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2014{
2015 struct p_uuids p;
2016 int i;
2017
2018 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2019 return 1;
2020
2021 for (i = UI_CURRENT; i < UI_SIZE; i++)
2022 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2023
2024 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2025 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
Philipp Reisner89e58e72011-01-19 13:12:45 +01002026 uuid_flags |= mdev->tconn->net_conf->want_lose ? 1 : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002027 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2028 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2029 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2030
2031 put_ldev(mdev);
2032
Philipp Reisnerc0129492011-01-19 16:58:16 +01002033 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002034}
2035
2036int drbd_send_uuids(struct drbd_conf *mdev)
2037{
2038 return _drbd_send_uuids(mdev, 0);
2039}
2040
2041int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2042{
2043 return _drbd_send_uuids(mdev, 8);
2044}
2045
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002046void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2047{
2048 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2049 u64 *uuid = mdev->ldev->md.uuid;
2050 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2051 text,
2052 (unsigned long long)uuid[UI_CURRENT],
2053 (unsigned long long)uuid[UI_BITMAP],
2054 (unsigned long long)uuid[UI_HISTORY_START],
2055 (unsigned long long)uuid[UI_HISTORY_END]);
2056 put_ldev(mdev);
2057 } else {
2058 dev_info(DEV, "%s effective data uuid: %016llX\n",
2059 text,
2060 (unsigned long long)mdev->ed_uuid);
2061 }
2062}
2063
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002064int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002065{
2066 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002067 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002068
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002069 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2070
Philipp Reisner4a23f262011-01-11 17:42:17 +01002071 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002072 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002073 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002074 drbd_md_sync(mdev);
2075 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002076
Philipp Reisnerc0129492011-01-19 16:58:16 +01002077 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002078}
2079
Philipp Reisnere89b5912010-03-24 17:11:33 +01002080int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002081{
2082 struct p_sizes p;
2083 sector_t d_size, u_size;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002084 int q_order_type, max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002085 int ok;
2086
2087 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2088 D_ASSERT(mdev->ldev->backing_bdev);
2089 d_size = drbd_get_max_capacity(mdev->ldev);
2090 u_size = mdev->ldev->dc.disk_size;
2091 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisner99432fc2011-05-20 16:39:13 +02002092 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2093 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002094 put_ldev(mdev);
2095 } else {
2096 d_size = 0;
2097 u_size = 0;
2098 q_order_type = QUEUE_ORDERED_NONE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02002099 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002100 }
2101
2102 p.d_size = cpu_to_be64(d_size);
2103 p.u_size = cpu_to_be64(u_size);
2104 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Philipp Reisner99432fc2011-05-20 16:39:13 +02002105 p.max_bio_size = cpu_to_be32(max_bio_size);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002106 p.queue_order_type = cpu_to_be16(q_order_type);
2107 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002108
Philipp Reisnerc0129492011-01-19 16:58:16 +01002109 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002110 return ok;
2111}
2112
2113/**
2114 * drbd_send_state() - Sends the drbd state to the peer
2115 * @mdev: DRBD device.
2116 */
2117int drbd_send_state(struct drbd_conf *mdev)
2118{
2119 struct socket *sock;
2120 struct p_state p;
2121 int ok = 0;
2122
2123	/* Grab state lock so we won't send state if we're in the middle
2124 * of a cluster wide state change on another thread */
2125 drbd_state_lock(mdev);
2126
Philipp Reisnere42325a2011-01-19 13:55:45 +01002127 mutex_lock(&mdev->tconn->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002128
2129 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
Philipp Reisnere42325a2011-01-19 13:55:45 +01002130 sock = mdev->tconn->data.socket;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002131
2132 if (likely(sock != NULL)) {
Philipp Reisnerc0129492011-01-19 16:58:16 +01002133 ok = _drbd_send_cmd(mdev, sock, P_STATE, &p.head, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002134 }
2135
Philipp Reisnere42325a2011-01-19 13:55:45 +01002136 mutex_unlock(&mdev->tconn->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002137
2138 drbd_state_unlock(mdev);
2139 return ok;
2140}
2141
2142int drbd_send_state_req(struct drbd_conf *mdev,
2143 union drbd_state mask, union drbd_state val)
2144{
2145 struct p_req_state p;
2146
2147 p.mask = cpu_to_be32(mask.i);
2148 p.val = cpu_to_be32(val.i);
2149
Philipp Reisnerc0129492011-01-19 16:58:16 +01002150 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002151}
2152
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002153int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002154{
2155 struct p_req_state_reply p;
2156
2157 p.retcode = cpu_to_be32(retcode);
2158
Philipp Reisnerc0129492011-01-19 16:58:16 +01002159 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002160}
2161
2162int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2163 struct p_compressed_bm *p,
2164 struct bm_xfer_ctx *c)
2165{
2166 struct bitstream bs;
2167 unsigned long plain_bits;
2168 unsigned long tmp;
2169 unsigned long rl;
2170 unsigned len;
2171 unsigned toggle;
2172 int bits;
2173
2174 /* may we use this feature? */
2175 if ((mdev->sync_conf.use_rle == 0) ||
Philipp Reisner31890f42011-01-19 14:12:51 +01002176 (mdev->tconn->agreed_pro_version < 90))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002177 return 0;
2178
2179 if (c->bit_offset >= c->bm_bits)
2180 return 0; /* nothing to do. */
2181
2182	/* use at most this many bytes */
2183 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2184 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2185 /* plain bits covered in this code string */
2186 plain_bits = 0;
2187
2188 /* p->encoding & 0x80 stores whether the first run length is set.
2189 * bit offset is implicit.
2190 * start with toggle == 2 to be able to tell the first iteration */
2191 toggle = 2;
2192
2193	/* see how many plain bits we can stuff into one packet
2194 * using RLE and VLI. */
2195 do {
2196 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2197 : _drbd_bm_find_next(mdev, c->bit_offset);
2198 if (tmp == -1UL)
2199 tmp = c->bm_bits;
2200 rl = tmp - c->bit_offset;
2201
2202 if (toggle == 2) { /* first iteration */
2203 if (rl == 0) {
2204 /* the first checked bit was set,
2205 * store start value, */
2206 DCBP_set_start(p, 1);
2207 /* but skip encoding of zero run length */
2208 toggle = !toggle;
2209 continue;
2210 }
2211 DCBP_set_start(p, 0);
2212 }
2213
2214 /* paranoia: catch zero runlength.
2215 * can only happen if bitmap is modified while we scan it. */
2216 if (rl == 0) {
2217 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2218 "t:%u bo:%lu\n", toggle, c->bit_offset);
2219 return -1;
2220 }
2221
2222 bits = vli_encode_bits(&bs, rl);
2223 if (bits == -ENOBUFS) /* buffer full */
2224 break;
2225 if (bits <= 0) {
2226 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2227 return 0;
2228 }
2229
2230 toggle = !toggle;
2231 plain_bits += rl;
2232 c->bit_offset = tmp;
2233 } while (c->bit_offset < c->bm_bits);
2234
2235 len = bs.cur.b - p->code + !!bs.cur.bit;
2236
2237 if (plain_bits < (len << 3)) {
2238 /* incompressible with this method.
2239 * we need to rewind both word and bit position. */
2240 c->bit_offset -= plain_bits;
2241 bm_xfer_ctx_bit_to_word_offset(c);
2242 c->bit_offset = c->word_offset * BITS_PER_LONG;
2243 return 0;
2244 }
2245
2246 /* RLE + VLI was able to compress it just fine.
2247 * update c->word_offset. */
2248 bm_xfer_ctx_bit_to_word_offset(c);
2249
2250 /* store pad_bits */
2251 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2252
2253 return len;
2254}
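
/*
 * Worked example, added and not in the original source (run lengths made
 * up for illustration): a chunk starting with 5 clear bits, then 12 set
 * bits, then 100 clear bits is encoded as the runs 5, 12, 100, each
 * VLI-encoded into the bitstream, with DCBP_set_start(p, 0) recording
 * that the first run counts clear bits.  Had the chunk started with a
 * set bit, the zero-length leading run would be skipped and
 * DCBP_set_start(p, 1) used instead.
 */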
2255
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002256/**
2257 * send_bitmap_rle_or_plain
2258 *
2259 * Return 0 when done, 1 when another iteration is needed, and a negative error
2260 * code upon failure.
2261 */
2262static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002263send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Philipp Reisnerc0129492011-01-19 16:58:16 +01002264 struct p_header *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002265{
2266 struct p_compressed_bm *p = (void*)h;
2267 unsigned long num_words;
2268 int len;
2269 int ok;
2270
2271 len = fill_bitmap_rle_bits(mdev, p, c);
2272
2273 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002274 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002275
2276 if (len) {
2277 DCBP_set_code(p, RLE_VLI_Bits);
Philipp Reisnere42325a2011-01-19 13:55:45 +01002278 ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_COMPRESSED_BITMAP, h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07002279 sizeof(*p) + len, 0);
2280
2281 c->packets[0]++;
2282 c->bytes[0] += sizeof(*p) + len;
2283
2284 if (c->bit_offset >= c->bm_bits)
2285 len = 0; /* DONE */
2286 } else {
2287 /* was not compressible.
2288 * send a buffer full of plain text bits instead. */
2289 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2290 len = num_words * sizeof(long);
2291 if (len)
2292 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
Philipp Reisnere42325a2011-01-19 13:55:45 +01002293 ok = _drbd_send_cmd(mdev, mdev->tconn->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002294 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002295 c->word_offset += num_words;
2296 c->bit_offset = c->word_offset * BITS_PER_LONG;
2297
2298 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002299 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002300
2301 if (c->bit_offset > c->bm_bits)
2302 c->bit_offset = c->bm_bits;
2303 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002304 if (ok) {
2305 if (len == 0) {
2306 INFO_bm_xfer_stats(mdev, "send", c);
2307 return 0;
2308 } else
2309 return 1;
2310 }
2311 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002312}
2313
2314/* See the comment at receive_bitmap() */
2315int _drbd_send_bitmap(struct drbd_conf *mdev)
2316{
2317 struct bm_xfer_ctx c;
Philipp Reisnerc0129492011-01-19 16:58:16 +01002318 struct p_header *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002319 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002320
Andreas Gruenbacher841ce242010-12-15 19:31:20 +01002321 if (!expect(mdev->bitmap))
2322 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002323
2324 /* maybe we should use some per thread scratch page,
2325 * and allocate that during initial device creation? */
Philipp Reisnerc0129492011-01-19 16:58:16 +01002326 p = (struct p_header *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002327 if (!p) {
2328 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002329 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002330 }
2331
2332 if (get_ldev(mdev)) {
2333 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2334 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2335 drbd_bm_set_all(mdev);
2336 if (drbd_bm_write(mdev)) {
2337 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2338 * but otherwise process as per normal - need to tell other
2339 * side that a full resync is required! */
2340 dev_err(DEV, "Failed to write bitmap to disk!\n");
2341 } else {
2342 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2343 drbd_md_sync(mdev);
2344 }
2345 }
2346 put_ldev(mdev);
2347 }
2348
2349 c = (struct bm_xfer_ctx) {
2350 .bm_bits = drbd_bm_bits(mdev),
2351 .bm_words = drbd_bm_words(mdev),
2352 };
2353
2354 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002355 err = send_bitmap_rle_or_plain(mdev, p, &c);
2356 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002357
2358 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002359 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002360}
2361
2362int drbd_send_bitmap(struct drbd_conf *mdev)
2363{
2364 int err;
2365
2366 if (!drbd_get_data_sock(mdev))
2367 return -1;
2368 err = !_drbd_send_bitmap(mdev);
2369 drbd_put_data_sock(mdev);
2370 return err;
2371}
2372
2373int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2374{
2375 int ok;
2376 struct p_barrier_ack p;
2377
2378 p.barrier = barrier_nr;
2379 p.set_size = cpu_to_be32(set_size);
2380
2381 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002382 return false;
Philipp Reisnerc0129492011-01-19 16:58:16 +01002383 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002384 return ok;
2385}
2386
2387/**
2388 * _drbd_send_ack() - Sends an ack packet
2389 * @mdev: DRBD device.
2390 * @cmd: Packet command code.
2391 * @sector: sector, needs to be in big endian byte order
2392 * @blksize: size in byte, needs to be in big endian byte order
2393 * @block_id: Id, big endian byte order
2394 */
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01002395static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
2396 u64 sector, u32 blksize, u64 block_id)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002397{
2398 int ok;
2399 struct p_block_ack p;
2400
2401 p.sector = sector;
2402 p.block_id = block_id;
2403 p.blksize = blksize;
2404 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2405
Philipp Reisnere42325a2011-01-19 13:55:45 +01002406 if (!mdev->tconn->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002407 return false;
Philipp Reisnerc0129492011-01-19 16:58:16 +01002408 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002409 return ok;
2410}
2411
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002412/* dp->sector and dp->block_id already/still in network byte order,
2413 * data_size is payload size according to dp->head,
2414 * and may need to be corrected for digest size. */
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01002415int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002416 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002417{
Philipp Reisnera0638452011-01-19 14:31:32 +01002418 data_size -= (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_r_tfm) ?
2419 crypto_hash_digestsize(mdev->tconn->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002420 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2421 dp->block_id);
2422}
2423
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01002424int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
Philipp Reisnerb411b362009-09-25 16:07:19 -07002425 struct p_block_req *rp)
2426{
2427 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2428}
2429
2430/**
2431 * drbd_send_ack() - Sends an ack packet
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002432 * @mdev: DRBD device
2433 * @cmd: packet command code
2434 * @peer_req: peer request
Philipp Reisnerb411b362009-09-25 16:07:19 -07002435 */
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01002436int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002437 struct drbd_peer_request *peer_req)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002438{
2439 return _drbd_send_ack(mdev, cmd,
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002440 cpu_to_be64(peer_req->i.sector),
2441 cpu_to_be32(peer_req->i.size),
2442 peer_req->block_id);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002443}
2444
2445/* This function misuses the block_id field to signal if the blocks
2446 * are in sync or not. */
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01002447int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
Philipp Reisnerb411b362009-09-25 16:07:19 -07002448 sector_t sector, int blksize, u64 block_id)
2449{
2450 return _drbd_send_ack(mdev, cmd,
2451 cpu_to_be64(sector),
2452 cpu_to_be32(blksize),
2453 cpu_to_be64(block_id));
2454}
2455
2456int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2457 sector_t sector, int size, u64 block_id)
2458{
2459 int ok;
2460 struct p_block_req p;
2461
2462 p.sector = cpu_to_be64(sector);
2463 p.block_id = block_id;
2464 p.blksize = cpu_to_be32(size);
2465
Philipp Reisnerc0129492011-01-19 16:58:16 +01002466 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002467 return ok;
2468}
2469
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01002470int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
2471 void *digest, int digest_size, enum drbd_packet cmd)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002472{
2473 int ok;
2474 struct p_block_req p;
2475
Philipp Reisnerfd340c12011-01-19 16:57:39 +01002476 prepare_header(mdev, &p.head, cmd, sizeof(p) - sizeof(struct p_header) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002477 p.sector = cpu_to_be64(sector);
Andreas Gruenbacher9a8e7752011-01-11 14:04:09 +01002478 p.block_id = ID_SYNCER /* unused */;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002479 p.blksize = cpu_to_be32(size);
2480
Philipp Reisnere42325a2011-01-19 13:55:45 +01002481 mutex_lock(&mdev->tconn->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002482
Philipp Reisnere42325a2011-01-19 13:55:45 +01002483 ok = (sizeof(p) == drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), 0));
2484 ok = ok && (digest_size == drbd_send(mdev, mdev->tconn->data.socket, digest, digest_size, 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002485
Philipp Reisnere42325a2011-01-19 13:55:45 +01002486 mutex_unlock(&mdev->tconn->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002487
2488 return ok;
2489}
2490
2491int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2492{
2493 int ok;
2494 struct p_block_req p;
2495
2496 p.sector = cpu_to_be64(sector);
Andreas Gruenbacher9a8e7752011-01-11 14:04:09 +01002497 p.block_id = ID_SYNCER /* unused */;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002498 p.blksize = cpu_to_be32(size);
2499
Philipp Reisnerc0129492011-01-19 16:58:16 +01002500 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, &p.head, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002501 return ok;
2502}
2503
2504/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002505 * returns false if we should retry,
2506 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002507 */
2508static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2509{
2510 int drop_it;
2511 /* long elapsed = (long)(jiffies - mdev->last_received); */
2512
Philipp Reisnere42325a2011-01-19 13:55:45 +01002513 drop_it = mdev->tconn->meta.socket == sock
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01002514 || !mdev->tconn->asender.task
2515 || get_t_state(&mdev->tconn->asender) != RUNNING
Philipp Reisnerb411b362009-09-25 16:07:19 -07002516 || mdev->state.conn < C_CONNECTED;
2517
2518 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002519 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002520
Philipp Reisner31890f42011-01-19 14:12:51 +01002521 drop_it = !--mdev->tconn->ko_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002522 if (!drop_it) {
2523 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
Philipp Reisner31890f42011-01-19 14:12:51 +01002524 current->comm, current->pid, mdev->tconn->ko_count);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002525 request_ping(mdev);
2526 }
2527
2528 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2529}
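
/*
 * Added note, not part of the original source: a timeout on the meta
 * socket, a missing/stopped asender, or a connection state below
 * C_CONNECTED means we give up immediately.  Otherwise ko_count is
 * decremented and another ping is requested; only when the counter hits
 * zero do we report the connection as dead.
 */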
2530
Andreas Gruenbacher9e204cd2011-01-26 18:45:11 +01002531static void drbd_update_congested(struct drbd_conf *mdev)
2532{
2533 struct sock *sk = mdev->tconn->data.socket->sk;
2534 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
2535 set_bit(NET_CONGESTED, &mdev->flags);
2536}
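
/*
 * Illustration, numbers assumed for the example only: with a 128 KiB
 * socket send buffer, NET_CONGESTED is set once more than roughly 102 KiB
 * (4/5 of sk_sndbuf) are queued but not yet sent.
 */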
2537
Philipp Reisnerb411b362009-09-25 16:07:19 -07002538/* The idea of sendpage seems to be to put some kind of reference
2539 * to the page into the skb, and to hand it over to the NIC. In
2540 * this process get_page() gets called.
2541 *
2542 * As soon as the page was really sent over the network put_page()
2543 * gets called by some part of the network layer. [ NIC driver? ]
2544 *
2545 * [ get_page() / put_page() increment/decrement the count. If count
2546 * reaches 0 the page will be freed. ]
2547 *
2548 * This works nicely with pages from FSs.
2549 * But this means that in protocol A we might signal IO completion too early!
2550 *
2551 * In order not to corrupt data during a resync we must make sure
2552 * that we do not reuse our own buffer pages (EEs) too early, therefore
2553 * we have the net_ee list.
2554 *
2555 * XFS seems to have problems, still, it submits pages with page_count == 0!
2556 * As a workaround, we disable sendpage on pages
2557 * with page_count == 0 or PageSlab.
2558 */
2559static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002560 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002561{
Philipp Reisnere42325a2011-01-19 13:55:45 +01002562 int sent = drbd_send(mdev, mdev->tconn->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002563 kunmap(page);
2564 if (sent == size)
2565 mdev->send_cnt += size>>9;
2566 return sent == size;
2567}
2568
2569static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002570 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002571{
2572 mm_segment_t oldfs = get_fs();
2573 int sent, ok;
2574 int len = size;
2575
2576 /* e.g. XFS meta- & log-data is in slab pages, which have a
2577 * page_count of 0 and/or have PageSlab() set.
2578 * we cannot use send_page for those, as that does get_page();
2579 * put_page(); and would cause either a VM_BUG directly, or
2580 * __page_cache_release a page that would actually still be referenced
2581 * by someone, leading to some obscure delayed Oops somewhere else. */
2582 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002583 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002584
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002585 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002586 drbd_update_congested(mdev);
2587 set_fs(KERNEL_DS);
2588 do {
Philipp Reisnere42325a2011-01-19 13:55:45 +01002589 sent = mdev->tconn->data.socket->ops->sendpage(mdev->tconn->data.socket, page,
Philipp Reisnerb411b362009-09-25 16:07:19 -07002590 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002591 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002592 if (sent == -EAGAIN) {
2593 if (we_should_drop_the_connection(mdev,
Philipp Reisnere42325a2011-01-19 13:55:45 +01002594 mdev->tconn->data.socket))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002595 break;
2596 else
2597 continue;
2598 }
2599 if (sent <= 0) {
2600 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2601 __func__, (int)size, len, sent);
2602 break;
2603 }
2604 len -= sent;
2605 offset += sent;
2606 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2607 set_fs(oldfs);
2608 clear_bit(NET_CONGESTED, &mdev->flags);
2609
2610 ok = (len == 0);
2611 if (likely(ok))
2612 mdev->send_cnt += size>>9;
2613 return ok;
2614}
2615
2616static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2617{
2618 struct bio_vec *bvec;
2619 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002620 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002621 __bio_for_each_segment(bvec, bio, i, 0) {
2622 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002623 bvec->bv_offset, bvec->bv_len,
2624 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002625 return 0;
2626 }
2627 return 1;
2628}
2629
2630static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2631{
2632 struct bio_vec *bvec;
2633 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002634 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002635 __bio_for_each_segment(bvec, bio, i, 0) {
2636 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002637 bvec->bv_offset, bvec->bv_len,
2638 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002639 return 0;
2640 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002641 return 1;
2642}
2643
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002644static int _drbd_send_zc_ee(struct drbd_conf *mdev,
2645 struct drbd_peer_request *peer_req)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002646{
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002647 struct page *page = peer_req->pages;
2648 unsigned len = peer_req->i.size;
2649
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002650 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002651 page_chain_for_each(page) {
2652 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002653 if (!_drbd_send_page(mdev, page, 0, l,
2654 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002655 return 0;
2656 len -= l;
2657 }
2658 return 1;
2659}
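/*
 * Editor's sketch: a plausible shape for the page_chain_* helpers used in
 * _drbd_send_zc_ee() above.  The real definitions live in drbd_int.h; this
 * mirrors how drbd_pp_pool links spare pages through page_private() later
 * in this file, and is shown only to make the loop above easier to read.
 */
static inline struct page *example_page_chain_next(struct page *page)
{
	/* the next page of the chain is stashed in page->private */
	return (struct page *)page_private(page);
}

#define example_page_chain_for_each(page) \
	for (; page; page = example_page_chain_next(page))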
2660
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002661static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2662{
Philipp Reisner31890f42011-01-19 14:12:51 +01002663 if (mdev->tconn->agreed_pro_version >= 95)
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002664 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002665 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2666 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2667 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2668 else
Jens Axboe721a9602011-03-09 11:56:30 +01002669 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002670}
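/*
 * Editor's example of the mapping above (illustrative only): for a peer
 * speaking protocol >= 95, a bio submitted with REQ_SYNC | REQ_FLUSH |
 * REQ_FUA is announced on the wire as DP_RW_SYNC | DP_FLUSH | DP_FUA;
 * an older peer is only ever told about DP_RW_SYNC.
 */
static inline u32 example_wire_flags_for_flush_fua(struct drbd_conf *mdev)
{
	return bio_flags_to_wire(mdev, REQ_SYNC | REQ_FLUSH | REQ_FUA);
}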
2671
Philipp Reisnerb411b362009-09-25 16:07:19 -07002672/* Used to send write requests
2673 * R_PRIMARY -> Peer (P_DATA)
2674 */
2675int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2676{
2677 int ok = 1;
2678 struct p_data p;
2679 unsigned int dp_flags = 0;
2680 void *dgb;
2681 int dgs;
2682
2683 if (!drbd_get_data_sock(mdev))
2684 return 0;
2685
Philipp Reisnera0638452011-01-19 14:31:32 +01002686 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
2687 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002688
Philipp Reisnerfd340c12011-01-19 16:57:39 +01002689 prepare_header(mdev, &p.head, P_DATA, sizeof(p) - sizeof(struct p_header) + dgs + req->i.size);
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002690 p.sector = cpu_to_be64(req->i.sector);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002691 p.block_id = (unsigned long)req;
Philipp Reisnerfd340c12011-01-19 16:57:39 +01002692 p.seq_num = cpu_to_be32(req->seq_num = atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002693
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002694 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2695
Philipp Reisnerb411b362009-09-25 16:07:19 -07002696 if (mdev->state.conn >= C_SYNC_SOURCE &&
2697 mdev->state.conn <= C_PAUSED_SYNC_T)
2698 dp_flags |= DP_MAY_SET_IN_SYNC;
2699
2700 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002701 set_bit(UNPLUG_REMOTE, &mdev->flags);
2702 ok = (sizeof(p) ==
Philipp Reisnere42325a2011-01-19 13:55:45 +01002703 drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002704 if (ok && dgs) {
Philipp Reisnera0638452011-01-19 14:31:32 +01002705 dgb = mdev->tconn->int_dig_out;
2706 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, dgb);
Philipp Reisnere42325a2011-01-19 13:55:45 +01002707 ok = dgs == drbd_send(mdev, mdev->tconn->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002708 }
2709 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002710 /* For protocol A, we have to memcpy the payload into
2711 * socket buffers, as we may complete right away
2712 * as soon as we handed it over to tcp, at which point the data
2713 * pages may become invalid.
2714 *
2715 * For data-integrity enabled, we copy it as well, so we can be
2716 * sure that even if the bio pages may still be modified, it
2717 * won't change the data on the wire, thus if the digest checks
2718 * out ok after sending on this side, but does not fit on the
2719 * receiving side, we have certainly detected corruption elsewhere.
2720 */
Philipp Reisner89e58e72011-01-19 13:12:45 +01002721 if (mdev->tconn->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002722 ok = _drbd_send_bio(mdev, req->master_bio);
2723 else
2724 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002725
2726 /* double check digest, sometimes buffers have been modified in flight. */
2727 if (dgs > 0 && dgs <= 64) {
Bart Van Assche24c48302011-05-21 18:32:29 +02002728 /* 64 byte, 512 bit, is the largest digest size
Lars Ellenberg470be442010-11-10 10:36:52 +01002729 * currently supported in kernel crypto. */
2730 unsigned char digest[64];
Philipp Reisnera0638452011-01-19 14:31:32 +01002731 drbd_csum_bio(mdev, mdev->tconn->integrity_w_tfm, req->master_bio, digest);
2732 if (memcmp(mdev->tconn->int_dig_out, digest, dgs)) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002733 dev_warn(DEV,
2734 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002735 (unsigned long long)req->i.sector, req->i.size);
Lars Ellenberg470be442010-11-10 10:36:52 +01002736 }
2737 } /* else if (dgs > 64) {
2738 ... Be noisy about digest too large ...
2739 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002740 }
2741
2742 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002743
Philipp Reisnerb411b362009-09-25 16:07:19 -07002744 return ok;
2745}
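/*
 * Editor's illustration of the race the comment above guards against:
 * upper layers may rewrite a bio's pages after the digest was computed
 * but before the data is fully on the wire.  A toy additive checksum
 * stands in for the crypto_hash digest DRBD actually uses; the function
 * name is made up for the example.
 */
static bool example_detects_inflight_modification(void)
{
	unsigned char buf[64];
	unsigned int sent_digest, recheck, i;

	memset(buf, 0xaa, sizeof(buf));
	for (sent_digest = 0, i = 0; i < sizeof(buf); i++)
		sent_digest += buf[i];		/* digest that went out on the wire */
	buf[17] = 0x55;				/* upper layer rewrites the buffer in flight */
	for (recheck = 0, i = 0; i < sizeof(buf); i++)
		recheck += buf[i];		/* DRBD's "double check" above */
	return recheck != sent_digest;		/* true: the mismatch would be reported */
}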
2746
2747/* answer packet, used to send data back for read requests:
2748 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2749 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2750 */
Andreas Gruenbacherd8763022011-01-26 17:39:41 +01002751int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002752 struct drbd_peer_request *peer_req)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002753{
2754 int ok;
2755 struct p_data p;
2756 void *dgb;
2757 int dgs;
2758
Philipp Reisnera0638452011-01-19 14:31:32 +01002759 dgs = (mdev->tconn->agreed_pro_version >= 87 && mdev->tconn->integrity_w_tfm) ?
2760 crypto_hash_digestsize(mdev->tconn->integrity_w_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002761
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002762 prepare_header(mdev, &p.head, cmd, sizeof(p) -
2763 sizeof(struct p_header80) +
2764 dgs + peer_req->i.size);
2765 p.sector = cpu_to_be64(peer_req->i.sector);
2766 p.block_id = peer_req->block_id;
Andreas Gruenbachercc378272011-01-26 18:01:50 +01002767 p.seq_num = 0; /* unused */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002768
2769 /* Only called by our kernel thread.
2770 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2771 * in response to admin command or module unload.
2772 */
2773 if (!drbd_get_data_sock(mdev))
2774 return 0;
2775
Philipp Reisnere42325a2011-01-19 13:55:45 +01002776 ok = sizeof(p) == drbd_send(mdev, mdev->tconn->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002777 if (ok && dgs) {
Philipp Reisnera0638452011-01-19 14:31:32 +01002778 dgb = mdev->tconn->int_dig_out;
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002779 drbd_csum_ee(mdev, mdev->tconn->integrity_w_tfm, peer_req, dgb);
Philipp Reisnere42325a2011-01-19 13:55:45 +01002780 ok = dgs == drbd_send(mdev, mdev->tconn->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002781 }
2782 if (ok)
Andreas Gruenbacherdb830c42011-02-04 15:57:48 +01002783 ok = _drbd_send_zc_ee(mdev, peer_req);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002784
2785 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc52010-05-04 12:33:58 +02002786
Philipp Reisnerb411b362009-09-25 16:07:19 -07002787 return ok;
2788}
2789
Philipp Reisner73a01a12010-10-27 14:33:00 +02002790int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2791{
2792 struct p_block_desc p;
2793
Andreas Gruenbacherace652a2011-01-03 17:09:58 +01002794 p.sector = cpu_to_be64(req->i.sector);
2795 p.blksize = cpu_to_be32(req->i.size);
Philipp Reisner73a01a12010-10-27 14:33:00 +02002796
2797 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2798}
2799
Philipp Reisnerb411b362009-09-25 16:07:19 -07002800/*
2801 drbd_send distinguishes two cases:
2802
2803 Packets sent via the data socket "sock"
2804 and packets sent via the meta data socket "msock"
2805
2806 sock msock
2807 -----------------+-------------------------+------------------------------
2808 timeout conf.timeout / 2 conf.timeout / 2
2809 timeout action send a ping via msock Abort communication
2810 and close all sockets
2811*/
2812
2813/*
2814 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2815 */
2816int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2817 void *buf, size_t size, unsigned msg_flags)
2818{
2819 struct kvec iov;
2820 struct msghdr msg;
2821 int rv, sent = 0;
2822
2823 if (!sock)
2824 return -1000;
2825
2826 /* THINK if (signal_pending) return ... ? */
2827
2828 iov.iov_base = buf;
2829 iov.iov_len = size;
2830
2831 msg.msg_name = NULL;
2832 msg.msg_namelen = 0;
2833 msg.msg_control = NULL;
2834 msg.msg_controllen = 0;
2835 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2836
Philipp Reisnere42325a2011-01-19 13:55:45 +01002837 if (sock == mdev->tconn->data.socket) {
Philipp Reisner31890f42011-01-19 14:12:51 +01002838 mdev->tconn->ko_count = mdev->tconn->net_conf->ko_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002839 drbd_update_congested(mdev);
2840 }
2841 do {
2842 /* STRANGE
2843 * tcp_sendmsg does _not_ use its size parameter at all ?
2844 *
2845 * -EAGAIN on timeout, -EINTR on signal.
2846 */
2847/* THINK
2848 * do we need to block DRBD_SIG if sock == &meta.socket ??
2849 * otherwise wake_asender() might interrupt some send_*Ack !
2850 */
2851 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2852 if (rv == -EAGAIN) {
2853 if (we_should_drop_the_connection(mdev, sock))
2854 break;
2855 else
2856 continue;
2857 }
2858 D_ASSERT(rv != 0);
2859 if (rv == -EINTR) {
2860 flush_signals(current);
2861 rv = 0;
2862 }
2863 if (rv < 0)
2864 break;
2865 sent += rv;
2866 iov.iov_base += rv;
2867 iov.iov_len -= rv;
2868 } while (sent < size);
2869
Philipp Reisnere42325a2011-01-19 13:55:45 +01002870 if (sock == mdev->tconn->data.socket)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002871 clear_bit(NET_CONGESTED, &mdev->flags);
2872
2873 if (rv <= 0) {
2874 if (rv != -EAGAIN) {
2875 dev_err(DEV, "%s_sendmsg returned %d\n",
Philipp Reisnere42325a2011-01-19 13:55:45 +01002876 sock == mdev->tconn->meta.socket ? "msock" : "sock",
Philipp Reisnerb411b362009-09-25 16:07:19 -07002877 rv);
2878 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2879 } else
2880 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2881 }
2882
2883 return sent;
2884}
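/*
 * Editor's sketch: a user-space analogue of the send loop above, showing
 * the same treatment of short writes, EINTR and EAGAIN.  Uses POSIX
 * send(2) instead of kernel_sendmsg(); the "should we drop the
 * connection" policy is left out.  Not driver code.
 */
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t example_send_all(int fd, const void *buf, size_t size)
{
	size_t done = 0;

	while (done < size) {
		ssize_t rv = send(fd, (const char *)buf + done, size - done,
				  MSG_NOSIGNAL);
		if (rv < 0 && (errno == EINTR || errno == EAGAIN))
			continue;	/* retry, as the kernel loop does   */
		if (rv <= 0)
			return -1;	/* hard error or orderly shutdown   */
		done += rv;		/* short send: advance and continue */
	}
	return done;
}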
2885
2886static int drbd_open(struct block_device *bdev, fmode_t mode)
2887{
2888 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2889 unsigned long flags;
2890 int rv = 0;
2891
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002892 mutex_lock(&drbd_main_mutex);
Philipp Reisner87eeee42011-01-19 14:16:30 +01002893 spin_lock_irqsave(&mdev->tconn->req_lock, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002894 /* to have a stable mdev->state.role
2895 * and no race with updating open_cnt */
2896
2897 if (mdev->state.role != R_PRIMARY) {
2898 if (mode & FMODE_WRITE)
2899 rv = -EROFS;
2900 else if (!allow_oos)
2901 rv = -EMEDIUMTYPE;
2902 }
2903
2904 if (!rv)
2905 mdev->open_cnt++;
Philipp Reisner87eeee42011-01-19 14:16:30 +01002906 spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002907 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002908
2909 return rv;
2910}
2911
2912static int drbd_release(struct gendisk *gd, fmode_t mode)
2913{
2914 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002915 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002916 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002917 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002918 return 0;
2919}
2920
Philipp Reisnerb411b362009-09-25 16:07:19 -07002921static void drbd_set_defaults(struct drbd_conf *mdev)
2922{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002923 /* This way we get a compile error when sync_conf grows,
2924 and we forget to initialize it here */
2925 mdev->sync_conf = (struct syncer_conf) {
2926 /* .rate = */ DRBD_RATE_DEF,
2927 /* .after = */ DRBD_AFTER_DEF,
2928 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002929 /* .verify_alg = */ {}, 0,
2930 /* .cpu_mask = */ {}, 0,
2931 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002932 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002933 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2934 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2935 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2936 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002937 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2938 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002939 };
2940
2941 /* Have to do it this way, because the layout differs between
2942 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002943 mdev->state = (union drbd_state) {
2944 { .role = R_SECONDARY,
2945 .peer = R_UNKNOWN,
2946 .conn = C_STANDALONE,
2947 .disk = D_DISKLESS,
2948 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002949 .susp = 0,
2950 .susp_nod = 0,
2951 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002952 } };
2953}
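/*
 * Editor's note on the endianness comment above, as a sketch: union
 * drbd_state overlays an integer with a struct of bit-fields, and
 * bit-field layout depends on the architecture.  Initializing through
 * the named fields, as drbd_set_defaults() does, is therefore portable;
 * writing a literal into the integer view of the union would encode
 * different states on big- and little-endian machines.  Hypothetical
 * helper, for illustration only.
 */
static inline union drbd_state example_initial_state(void)
{
	union drbd_state s = {
		{ .role = R_SECONDARY,	/* named bit-fields: layout independent */
		  .conn = C_STANDALONE,
		  .disk = D_DISKLESS }
	};
	/* NOT portable: assigning a raw integer to the union's int view */
	return s;
}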
2954
2955void drbd_init_set_defaults(struct drbd_conf *mdev)
2956{
2957 /* the memset(,0,) did most of this.
2958 * note: only assignments, no allocation in here */
2959
2960 drbd_set_defaults(mdev);
2961
Philipp Reisnerb411b362009-09-25 16:07:19 -07002962 atomic_set(&mdev->ap_bio_cnt, 0);
2963 atomic_set(&mdev->ap_pending_cnt, 0);
2964 atomic_set(&mdev->rs_pending_cnt, 0);
2965 atomic_set(&mdev->unacked_cnt, 0);
2966 atomic_set(&mdev->local_cnt, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002967 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02002968 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02002969 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002970 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02002971 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002972
2973 mutex_init(&mdev->md_io_mutex);
Philipp Reisnere42325a2011-01-19 13:55:45 +01002974 mutex_init(&mdev->tconn->data.mutex);
2975 mutex_init(&mdev->tconn->meta.mutex);
2976 sema_init(&mdev->tconn->data.work.s, 0);
2977 sema_init(&mdev->tconn->meta.work.s, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002978 mutex_init(&mdev->state_mutex);
2979
Philipp Reisnere42325a2011-01-19 13:55:45 +01002980 spin_lock_init(&mdev->tconn->data.work.q_lock);
2981 spin_lock_init(&mdev->tconn->meta.work.q_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002982
2983 spin_lock_init(&mdev->al_lock);
Philipp Reisner87eeee42011-01-19 14:16:30 +01002984 spin_lock_init(&mdev->tconn->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002985 spin_lock_init(&mdev->peer_seq_lock);
2986 spin_lock_init(&mdev->epoch_lock);
2987
2988 INIT_LIST_HEAD(&mdev->active_ee);
2989 INIT_LIST_HEAD(&mdev->sync_ee);
2990 INIT_LIST_HEAD(&mdev->done_ee);
2991 INIT_LIST_HEAD(&mdev->read_ee);
2992 INIT_LIST_HEAD(&mdev->net_ee);
2993 INIT_LIST_HEAD(&mdev->resync_reads);
Philipp Reisnere42325a2011-01-19 13:55:45 +01002994 INIT_LIST_HEAD(&mdev->tconn->data.work.q);
2995 INIT_LIST_HEAD(&mdev->tconn->meta.work.q);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002996 INIT_LIST_HEAD(&mdev->resync_work.list);
2997 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02002998 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002999 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003000 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003001 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003002
Philipp Reisner794abb72010-12-27 11:51:23 +01003003 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003004 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003005 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003006 mdev->md_sync_work.cb = w_md_sync;
3007 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003008 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003009 init_timer(&mdev->resync_timer);
3010 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003011 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003012 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003013 mdev->resync_timer.function = resync_timer_fn;
3014 mdev->resync_timer.data = (unsigned long) mdev;
3015 mdev->md_sync_timer.function = md_sync_timer_fn;
3016 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003017 mdev->start_resync_timer.function = start_resync_timer_fn;
3018 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003019 mdev->request_timer.function = request_timer_fn;
3020 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003021
3022 init_waitqueue_head(&mdev->misc_wait);
3023 init_waitqueue_head(&mdev->state_wait);
3024 init_waitqueue_head(&mdev->ee_wait);
3025 init_waitqueue_head(&mdev->al_wait);
3026 init_waitqueue_head(&mdev->seq_wait);
3027
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01003028 drbd_thread_init(mdev, &mdev->tconn->receiver, drbdd_init);
3029 drbd_thread_init(mdev, &mdev->tconn->worker, drbd_worker);
3030 drbd_thread_init(mdev, &mdev->tconn->asender, drbd_asender);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003031
Philipp Reisnerfd340c12011-01-19 16:57:39 +01003032 /* mdev->tconn->agreed_pro_version gets initialized in drbd_connect() */
Philipp Reisner2451fc32010-08-24 13:43:11 +02003033 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003034 mdev->resync_wenr = LC_FREE;
Philipp Reisner99432fc2011-05-20 16:39:13 +02003035 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3036 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003037}
3038
3039void drbd_mdev_cleanup(struct drbd_conf *mdev)
3040{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003041 int i;
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01003042 if (mdev->tconn->receiver.t_state != NONE)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003043 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01003044 mdev->tconn->receiver.t_state);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003045
3046 /* no need to lock it, I'm the only thread alive */
3047 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3048 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3049 mdev->al_writ_cnt =
3050 mdev->bm_writ_cnt =
3051 mdev->read_cnt =
3052 mdev->recv_cnt =
3053 mdev->send_cnt =
3054 mdev->writ_cnt =
3055 mdev->p_size =
3056 mdev->rs_start =
3057 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003058 mdev->rs_failed = 0;
3059 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003060 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003061 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3062 mdev->rs_mark_left[i] = 0;
3063 mdev->rs_mark_time[i] = 0;
3064 }
Philipp Reisner89e58e72011-01-19 13:12:45 +01003065 D_ASSERT(mdev->tconn->net_conf == NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003066
3067 drbd_set_my_capacity(mdev, 0);
3068 if (mdev->bitmap) {
3069 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003070 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003071 drbd_bm_cleanup(mdev);
3072 }
3073
3074 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003075 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003076
3077 /*
3078 * currently we call drbd_init_ee only on module load, so
3079 * we may call drbd_release_ee only on module unload!
3080 */
3081 D_ASSERT(list_empty(&mdev->active_ee));
3082 D_ASSERT(list_empty(&mdev->sync_ee));
3083 D_ASSERT(list_empty(&mdev->done_ee));
3084 D_ASSERT(list_empty(&mdev->read_ee));
3085 D_ASSERT(list_empty(&mdev->net_ee));
3086 D_ASSERT(list_empty(&mdev->resync_reads));
Philipp Reisnere42325a2011-01-19 13:55:45 +01003087 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
3088 D_ASSERT(list_empty(&mdev->tconn->meta.work.q));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003089 D_ASSERT(list_empty(&mdev->resync_work.list));
3090 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003091 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003092
3093 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003094}
3095
3096
3097static void drbd_destroy_mempools(void)
3098{
3099 struct page *page;
3100
3101 while (drbd_pp_pool) {
3102 page = drbd_pp_pool;
3103 drbd_pp_pool = (struct page *)page_private(page);
3104 __free_page(page);
3105 drbd_pp_vacant--;
3106 }
3107
3108 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3109
3110 if (drbd_ee_mempool)
3111 mempool_destroy(drbd_ee_mempool);
3112 if (drbd_request_mempool)
3113 mempool_destroy(drbd_request_mempool);
3114 if (drbd_ee_cache)
3115 kmem_cache_destroy(drbd_ee_cache);
3116 if (drbd_request_cache)
3117 kmem_cache_destroy(drbd_request_cache);
3118 if (drbd_bm_ext_cache)
3119 kmem_cache_destroy(drbd_bm_ext_cache);
3120 if (drbd_al_ext_cache)
3121 kmem_cache_destroy(drbd_al_ext_cache);
3122
3123 drbd_ee_mempool = NULL;
3124 drbd_request_mempool = NULL;
3125 drbd_ee_cache = NULL;
3126 drbd_request_cache = NULL;
3127 drbd_bm_ext_cache = NULL;
3128 drbd_al_ext_cache = NULL;
3129
3130 return;
3131}
3132
3133static int drbd_create_mempools(void)
3134{
3135 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003136 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003137 int i;
3138
3139 /* prepare our caches and mempools */
3140 drbd_request_mempool = NULL;
3141 drbd_ee_cache = NULL;
3142 drbd_request_cache = NULL;
3143 drbd_bm_ext_cache = NULL;
3144 drbd_al_ext_cache = NULL;
3145 drbd_pp_pool = NULL;
3146
3147 /* caches */
3148 drbd_request_cache = kmem_cache_create(
3149 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3150 if (drbd_request_cache == NULL)
3151 goto Enomem;
3152
3153 drbd_ee_cache = kmem_cache_create(
Andreas Gruenbacherf6ffca92011-02-04 15:30:34 +01003154 "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003155 if (drbd_ee_cache == NULL)
3156 goto Enomem;
3157
3158 drbd_bm_ext_cache = kmem_cache_create(
3159 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3160 if (drbd_bm_ext_cache == NULL)
3161 goto Enomem;
3162
3163 drbd_al_ext_cache = kmem_cache_create(
3164 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3165 if (drbd_al_ext_cache == NULL)
3166 goto Enomem;
3167
3168 /* mempools */
3169 drbd_request_mempool = mempool_create(number,
3170 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3171 if (drbd_request_mempool == NULL)
3172 goto Enomem;
3173
3174 drbd_ee_mempool = mempool_create(number,
3175 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003176 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003177 goto Enomem;
3178
3179 /* drbd's page pool */
3180 spin_lock_init(&drbd_pp_lock);
3181
3182 for (i = 0; i < number; i++) {
3183 page = alloc_page(GFP_HIGHUSER);
3184 if (!page)
3185 goto Enomem;
3186 set_page_private(page, (unsigned long)drbd_pp_pool);
3187 drbd_pp_pool = page;
3188 }
3189 drbd_pp_vacant = number;
3190
3191 return 0;
3192
3193Enomem:
3194 drbd_destroy_mempools(); /* in case we allocated some */
3195 return -ENOMEM;
3196}
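/*
 * Editor's sketch: how a consumer could pop/push pages from the
 * drbd_pp_pool stack built above, which links spare pages through
 * page_private().  The driver's real allocation helpers live elsewhere
 * and additionally handle mempool/GFP fallback, the drbd_pp_wait
 * waitqueue and per-device accounting; this only illustrates the
 * linked-list-of-pages idea.
 */
static struct page *example_pp_pop(void)
{
	struct page *page;

	spin_lock(&drbd_pp_lock);
	page = drbd_pp_pool;
	if (page) {
		drbd_pp_pool = (struct page *)page_private(page);
		set_page_private(page, 0);
		drbd_pp_vacant--;
	}
	spin_unlock(&drbd_pp_lock);
	return page;
}

static void example_pp_push(struct page *page)
{
	spin_lock(&drbd_pp_lock);
	set_page_private(page, (unsigned long)drbd_pp_pool);
	drbd_pp_pool = page;
	drbd_pp_vacant++;
	spin_unlock(&drbd_pp_lock);
}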
3197
3198static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3199 void *unused)
3200{
3201 /* just so we have it. you never know what interesting things we
3202 * might want to do here some day...
3203 */
3204
3205 return NOTIFY_DONE;
3206}
3207
3208static struct notifier_block drbd_notifier = {
3209 .notifier_call = drbd_notify_sys,
3210};
3211
3212static void drbd_release_ee_lists(struct drbd_conf *mdev)
3213{
3214 int rr;
3215
3216 rr = drbd_release_ee(mdev, &mdev->active_ee);
3217 if (rr)
3218 dev_err(DEV, "%d EEs in active list found!\n", rr);
3219
3220 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3221 if (rr)
3222 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3223
3224 rr = drbd_release_ee(mdev, &mdev->read_ee);
3225 if (rr)
3226 dev_err(DEV, "%d EEs in read list found!\n", rr);
3227
3228 rr = drbd_release_ee(mdev, &mdev->done_ee);
3229 if (rr)
3230 dev_err(DEV, "%d EEs in done list found!\n", rr);
3231
3232 rr = drbd_release_ee(mdev, &mdev->net_ee);
3233 if (rr)
3234 dev_err(DEV, "%d EEs in net list found!\n", rr);
3235}
3236
3237/* caution. no locking.
3238 * currently only used from module cleanup code. */
3239static void drbd_delete_device(unsigned int minor)
3240{
3241 struct drbd_conf *mdev = minor_to_mdev(minor);
3242
3243 if (!mdev)
3244 return;
3245
3246 /* paranoia asserts */
Andreas Gruenbacher70dc65e2010-12-21 14:46:57 +01003247 D_ASSERT(mdev->open_cnt == 0);
Philipp Reisnere42325a2011-01-19 13:55:45 +01003248 D_ASSERT(list_empty(&mdev->tconn->data.work.q));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003249 /* end paranoia asserts */
3250
3251 del_gendisk(mdev->vdisk);
3252
3253 /* cleanup stuff that may have been allocated during
3254 * device (re-)configuration or state changes */
3255
3256 if (mdev->this_bdev)
3257 bdput(mdev->this_bdev);
3258
3259 drbd_free_resources(mdev);
Philipp Reisner21114382011-01-19 12:26:59 +01003260 drbd_free_tconn(mdev->tconn);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003261
3262 drbd_release_ee_lists(mdev);
3263
Philipp Reisnerb411b362009-09-25 16:07:19 -07003264 lc_destroy(mdev->act_log);
3265 lc_destroy(mdev->resync);
3266
3267 kfree(mdev->p_uuid);
3268 /* mdev->p_uuid = NULL; */
3269
Philipp Reisnerb411b362009-09-25 16:07:19 -07003270 /* cleanup the rest that has been
3271 * allocated from drbd_new_device
3272 * and actually free the mdev itself */
3273 drbd_free_mdev(mdev);
3274}
3275
3276static void drbd_cleanup(void)
3277{
3278 unsigned int i;
3279
3280 unregister_reboot_notifier(&drbd_notifier);
3281
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003282 /* first remove proc,
3283 * drbdsetup uses its presence to detect
3284 * whether DRBD is loaded.
3285 * If we got stuck in proc removal,
3286 * but have netlink already deregistered,
3287 * some drbdsetup commands may wait forever
3288 * for an answer.
3289 */
3290 if (drbd_proc)
3291 remove_proc_entry("drbd", NULL);
3292
Philipp Reisnerb411b362009-09-25 16:07:19 -07003293 drbd_nl_cleanup();
3294
3295 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003296 i = minor_count;
3297 while (i--)
3298 drbd_delete_device(i);
3299 drbd_destroy_mempools();
3300 }
3301
3302 kfree(minor_table);
3303
3304 unregister_blkdev(DRBD_MAJOR, "drbd");
3305
3306 printk(KERN_INFO "drbd: module cleanup done.\n");
3307}
3308
3309/**
3310 * drbd_congested() - Callback for pdflush
3311 * @congested_data: User data
3312 * @bdi_bits: Bits pdflush is currently interested in
3313 *
3314 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3315 */
3316static int drbd_congested(void *congested_data, int bdi_bits)
3317{
3318 struct drbd_conf *mdev = congested_data;
3319 struct request_queue *q;
3320 char reason = '-';
3321 int r = 0;
3322
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003323 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003324 /* DRBD has frozen IO */
3325 r = bdi_bits;
3326 reason = 'd';
3327 goto out;
3328 }
3329
3330 if (get_ldev(mdev)) {
3331 q = bdev_get_queue(mdev->ldev->backing_bdev);
3332 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3333 put_ldev(mdev);
3334 if (r)
3335 reason = 'b';
3336 }
3337
3338 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3339 r |= (1 << BDI_async_congested);
3340 reason = reason == 'b' ? 'a' : 'n';
3341 }
3342
3343out:
3344 mdev->congestion_reason = reason;
3345 return r;
3346}
3347
Philipp Reisner21114382011-01-19 12:26:59 +01003348struct drbd_tconn *drbd_new_tconn(char *name)
3349{
3350 struct drbd_tconn *tconn;
3351
3352 tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
3353 if (!tconn)
3354 return NULL;
3355
3356 tconn->name = kstrdup(name, GFP_KERNEL);
3357 if (!tconn->name)
3358 goto fail;
3359
Philipp Reisnerb2fb6dbe2011-01-19 13:48:44 +01003360 atomic_set(&tconn->net_cnt, 0);
3361 init_waitqueue_head(&tconn->net_cnt_wait);
3362
Philipp Reisner21114382011-01-19 12:26:59 +01003363 write_lock_irq(&global_state_lock);
3364 list_add(&tconn->all_tconn, &drbd_tconns);
3365 write_unlock_irq(&global_state_lock);
3366
3367 return tconn;
3368
3369fail:
3370 kfree(tconn->name);
3371 kfree(tconn);
3372
3373 return NULL;
3374}
3375
3376void drbd_free_tconn(struct drbd_tconn *tconn)
3377{
3378 write_lock_irq(&global_state_lock);
3379 list_del(&tconn->all_tconn);
3380 write_unlock_irq(&global_state_lock);
3381
3382 kfree(tconn->name);
Philipp Reisnerb42a70a2011-01-27 10:55:20 +01003383 kfree(tconn->int_dig_out);
3384 kfree(tconn->int_dig_in);
3385 kfree(tconn->int_dig_vv);
Philipp Reisner21114382011-01-19 12:26:59 +01003386 kfree(tconn);
3387}
3388
Philipp Reisnerb411b362009-09-25 16:07:19 -07003389struct drbd_conf *drbd_new_device(unsigned int minor)
3390{
3391 struct drbd_conf *mdev;
3392 struct gendisk *disk;
3393 struct request_queue *q;
3394
3395 /* GFP_KERNEL, we are outside of all write-out paths */
3396 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3397 if (!mdev)
3398 return NULL;
Philipp Reisner21114382011-01-19 12:26:59 +01003399 mdev->tconn = drbd_new_tconn("dummy");
3400 if (!mdev->tconn)
3401 goto out_no_tconn;
3402
Philipp Reisnerb411b362009-09-25 16:07:19 -07003403 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3404 goto out_no_cpumask;
3405
Philipp Reisner21114382011-01-19 12:26:59 +01003406 mdev->tconn->volume0 = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003407 mdev->minor = minor;
3408
3409 drbd_init_set_defaults(mdev);
3410
3411 q = blk_alloc_queue(GFP_KERNEL);
3412 if (!q)
3413 goto out_no_q;
3414 mdev->rq_queue = q;
3415 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003416
3417 disk = alloc_disk(1);
3418 if (!disk)
3419 goto out_no_disk;
3420 mdev->vdisk = disk;
3421
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003422 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003423
3424 disk->queue = q;
3425 disk->major = DRBD_MAJOR;
3426 disk->first_minor = minor;
3427 disk->fops = &drbd_ops;
3428 sprintf(disk->disk_name, "drbd%d", minor);
3429 disk->private_data = mdev;
3430
3431 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3432 /* we have no partitions. we contain only ourselves. */
3433 mdev->this_bdev->bd_contains = mdev->this_bdev;
3434
3435 q->backing_dev_info.congested_fn = drbd_congested;
3436 q->backing_dev_info.congested_data = mdev;
3437
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003438 blk_queue_make_request(q, drbd_make_request);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003439 /* Setting the max_hw_sectors to an odd value of 8 KiB here.
3440 This triggers a max_bio_size message upon first attach or connect */
3441 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003442 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3443 blk_queue_merge_bvec(q, drbd_merge_bvec);
Philipp Reisner87eeee42011-01-19 14:16:30 +01003444 q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003445
3446 mdev->md_io_page = alloc_page(GFP_KERNEL);
3447 if (!mdev->md_io_page)
3448 goto out_no_io_page;
3449
3450 if (drbd_bm_init(mdev))
3451 goto out_no_bitmap;
3452 /* no need to lock access, we are still initializing this minor device. */
3453 if (!tl_init(mdev))
3454 goto out_no_tl;
Andreas Gruenbacherdac13892011-01-21 17:18:39 +01003455 mdev->read_requests = RB_ROOT;
Andreas Gruenbacherde696712011-01-20 15:00:24 +01003456 mdev->write_requests = RB_ROOT;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003457
Philipp Reisnerb411b362009-09-25 16:07:19 -07003458 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3459 if (!mdev->current_epoch)
3460 goto out_no_epoch;
3461
3462 INIT_LIST_HEAD(&mdev->current_epoch->list);
3463 mdev->epochs = 1;
3464
3465 return mdev;
3466
3467/* out_whatever_else:
3468 kfree(mdev->current_epoch); */
3469out_no_epoch:
Philipp Reisnerb411b362009-09-25 16:07:19 -07003470 tl_cleanup(mdev);
3471out_no_tl:
3472 drbd_bm_cleanup(mdev);
3473out_no_bitmap:
3474 __free_page(mdev->md_io_page);
3475out_no_io_page:
3476 put_disk(disk);
3477out_no_disk:
3478 blk_cleanup_queue(q);
3479out_no_q:
3480 free_cpumask_var(mdev->cpu_mask);
3481out_no_cpumask:
Philipp Reisner21114382011-01-19 12:26:59 +01003482 drbd_free_tconn(mdev->tconn);
3483out_no_tconn:
Philipp Reisnerb411b362009-09-25 16:07:19 -07003484 kfree(mdev);
3485 return NULL;
3486}
3487
3488/* counterpart of drbd_new_device.
3489 * last part of drbd_delete_device. */
3490void drbd_free_mdev(struct drbd_conf *mdev)
3491{
3492 kfree(mdev->current_epoch);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003493 tl_cleanup(mdev);
3494 if (mdev->bitmap) /* should no longer be there. */
3495 drbd_bm_cleanup(mdev);
3496 __free_page(mdev->md_io_page);
3497 put_disk(mdev->vdisk);
3498 blk_cleanup_queue(mdev->rq_queue);
3499 free_cpumask_var(mdev->cpu_mask);
3500 kfree(mdev);
3501}
3502
3503
3504int __init drbd_init(void)
3505{
3506 int err;
3507
Philipp Reisnerfd340c12011-01-19 16:57:39 +01003508 BUILD_BUG_ON(sizeof(struct p_header80) != sizeof(struct p_header95));
3509 BUILD_BUG_ON(sizeof(struct p_handshake) != 80);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003510
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003511 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003512 printk(KERN_ERR
3513 "drbd: invalid minor_count (%d)\n", minor_count);
3514#ifdef MODULE
3515 return -EINVAL;
3516#else
3517 minor_count = 8;
3518#endif
3519 }
3520
3521 err = drbd_nl_init();
3522 if (err)
3523 return err;
3524
3525 err = register_blkdev(DRBD_MAJOR, "drbd");
3526 if (err) {
3527 printk(KERN_ERR
3528 "drbd: unable to register block device major %d\n",
3529 DRBD_MAJOR);
3530 return err;
3531 }
3532
3533 register_reboot_notifier(&drbd_notifier);
3534
3535 /*
3536 * allocate all necessary structs
3537 */
3538 err = -ENOMEM;
3539
3540 init_waitqueue_head(&drbd_pp_wait);
3541
3542 drbd_proc = NULL; /* play safe for drbd_cleanup */
3543 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3544 GFP_KERNEL);
3545 if (!minor_table)
3546 goto Enomem;
3547
3548 err = drbd_create_mempools();
3549 if (err)
3550 goto Enomem;
3551
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003552 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003553 if (!drbd_proc) {
3554 printk(KERN_ERR "drbd: unable to register proc file\n");
3555 goto Enomem;
3556 }
3557
3558 rwlock_init(&global_state_lock);
Philipp Reisner21114382011-01-19 12:26:59 +01003559 INIT_LIST_HEAD(&drbd_tconns);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003560
3561 printk(KERN_INFO "drbd: initialized. "
3562 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3563 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3564 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3565 printk(KERN_INFO "drbd: registered as block device major %d\n",
3566 DRBD_MAJOR);
3567 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3568
3569 return 0; /* Success! */
3570
3571Enomem:
3572 drbd_cleanup();
3573 if (err == -ENOMEM)
3574 /* currently always the case */
3575 printk(KERN_ERR "drbd: ran out of memory\n");
3576 else
3577 printk(KERN_ERR "drbd: initialization failure\n");
3578 return err;
3579}
3580
3581void drbd_free_bc(struct drbd_backing_dev *ldev)
3582{
3583 if (ldev == NULL)
3584 return;
3585
Tejun Heoe525fd82010-11-13 11:55:17 +01003586 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3587 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003588
3589 kfree(ldev);
3590}
3591
3592void drbd_free_sock(struct drbd_conf *mdev)
3593{
Philipp Reisnere42325a2011-01-19 13:55:45 +01003594 if (mdev->tconn->data.socket) {
3595 mutex_lock(&mdev->tconn->data.mutex);
3596 kernel_sock_shutdown(mdev->tconn->data.socket, SHUT_RDWR);
3597 sock_release(mdev->tconn->data.socket);
3598 mdev->tconn->data.socket = NULL;
3599 mutex_unlock(&mdev->tconn->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003600 }
Philipp Reisnere42325a2011-01-19 13:55:45 +01003601 if (mdev->tconn->meta.socket) {
3602 mutex_lock(&mdev->tconn->meta.mutex);
3603 kernel_sock_shutdown(mdev->tconn->meta.socket, SHUT_RDWR);
3604 sock_release(mdev->tconn->meta.socket);
3605 mdev->tconn->meta.socket = NULL;
3606 mutex_unlock(&mdev->tconn->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003607 }
3608}
3609
3610
3611void drbd_free_resources(struct drbd_conf *mdev)
3612{
3613 crypto_free_hash(mdev->csums_tfm);
3614 mdev->csums_tfm = NULL;
3615 crypto_free_hash(mdev->verify_tfm);
3616 mdev->verify_tfm = NULL;
Philipp Reisnera0638452011-01-19 14:31:32 +01003617 crypto_free_hash(mdev->tconn->cram_hmac_tfm);
3618 mdev->tconn->cram_hmac_tfm = NULL;
3619 crypto_free_hash(mdev->tconn->integrity_w_tfm);
3620 mdev->tconn->integrity_w_tfm = NULL;
3621 crypto_free_hash(mdev->tconn->integrity_r_tfm);
3622 mdev->tconn->integrity_r_tfm = NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003623
3624 drbd_free_sock(mdev);
3625
3626 __no_warn(local,
3627 drbd_free_bc(mdev->ldev);
3628 mdev->ldev = NULL;);
3629}
3630
3631/* meta data management */
3632
3633struct meta_data_on_disk {
3634 u64 la_size; /* last agreed size. */
3635 u64 uuid[UI_SIZE]; /* UUIDs. */
3636 u64 device_uuid;
3637 u64 reserved_u64_1;
3638 u32 flags; /* MDF */
3639 u32 magic;
3640 u32 md_size_sect;
3641 u32 al_offset; /* offset to this block */
3642 u32 al_nr_extents; /* important for restoring the AL */
3643 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3644 u32 bm_offset; /* offset to the bitmap, from here */
3645 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
Philipp Reisner99432fc2011-05-20 16:39:13 +02003646 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3647 u32 reserved_u32[3];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003648
3649} __packed;
3650
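/*
 * Editor's sketch: the on-disk superblock above has to fit into the single
 * 512-byte block that drbd_md_sync() below clears and writes; all
 * multi-byte fields are stored big-endian (see the cpu_to_be*() /
 * be*_to_cpu() conversions in drbd_md_sync() and drbd_md_read()).  A
 * compile-time guard one could add, shown for illustration only:
 */
static inline void example_meta_data_layout_check(void)
{
	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
}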
3651/**
3652 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3653 * @mdev: DRBD device.
3654 */
3655void drbd_md_sync(struct drbd_conf *mdev)
3656{
3657 struct meta_data_on_disk *buffer;
3658 sector_t sector;
3659 int i;
3660
Lars Ellenbergee15b032010-09-03 10:00:09 +02003661 del_timer(&mdev->md_sync_timer);
3662 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003663 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3664 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003665
3666 /* We use D_FAILED here, and not D_ATTACHING, because we try to write
3667 * metadata even if we detach due to a disk failure! */
3668 if (!get_ldev_if_state(mdev, D_FAILED))
3669 return;
3670
Philipp Reisnerb411b362009-09-25 16:07:19 -07003671 mutex_lock(&mdev->md_io_mutex);
3672 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3673 memset(buffer, 0, 512);
3674
3675 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3676 for (i = UI_CURRENT; i < UI_SIZE; i++)
3677 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3678 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3679 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3680
3681 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3682 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3683 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3684 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3685 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3686
3687 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003688 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003689
3690 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3691 sector = mdev->ldev->md.md_offset;
3692
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003693 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003694 /* this was a try anyways ... */
3695 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003696 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003697 }
3698
3699 /* Update mdev->ldev->md.la_size_sect,
3700 * since we updated it on metadata. */
3701 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3702
3703 mutex_unlock(&mdev->md_io_mutex);
3704 put_ldev(mdev);
3705}
3706
3707/**
3708 * drbd_md_read() - Reads in the meta data super block
3709 * @mdev: DRBD device.
3710 * @bdev: Device from which the meta data should be read in.
3711 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003712 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003713 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3714 */
3715int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3716{
3717 struct meta_data_on_disk *buffer;
3718 int i, rv = NO_ERROR;
3719
3720 if (!get_ldev_if_state(mdev, D_ATTACHING))
3721 return ERR_IO_MD_DISK;
3722
Philipp Reisnerb411b362009-09-25 16:07:19 -07003723 mutex_lock(&mdev->md_io_mutex);
3724 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3725
3726 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003727 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003728 called BEFORE disk is attached */
3729 dev_err(DEV, "Error while reading metadata.\n");
3730 rv = ERR_IO_MD_DISK;
3731 goto err;
3732 }
3733
Andreas Gruenbachere7fad8a2011-01-11 13:54:02 +01003734 if (buffer->magic != cpu_to_be32(DRBD_MD_MAGIC)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003735 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3736 rv = ERR_MD_INVALID;
3737 goto err;
3738 }
3739 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3740 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3741 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3742 rv = ERR_MD_INVALID;
3743 goto err;
3744 }
3745 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3746 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3747 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3748 rv = ERR_MD_INVALID;
3749 goto err;
3750 }
3751 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3752 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3753 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3754 rv = ERR_MD_INVALID;
3755 goto err;
3756 }
3757
3758 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3759 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3760 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3761 rv = ERR_MD_INVALID;
3762 goto err;
3763 }
3764
3765 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3766 for (i = UI_CURRENT; i < UI_SIZE; i++)
3767 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3768 bdev->md.flags = be32_to_cpu(buffer->flags);
3769 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3770 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3771
Philipp Reisner87eeee42011-01-19 14:16:30 +01003772 spin_lock_irq(&mdev->tconn->req_lock);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003773 if (mdev->state.conn < C_CONNECTED) {
3774 int peer;
3775 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3776 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3777 mdev->peer_max_bio_size = peer;
3778 }
Philipp Reisner87eeee42011-01-19 14:16:30 +01003779 spin_unlock_irq(&mdev->tconn->req_lock);
Philipp Reisner99432fc2011-05-20 16:39:13 +02003780
Philipp Reisnerb411b362009-09-25 16:07:19 -07003781 if (mdev->sync_conf.al_extents < 7)
3782 mdev->sync_conf.al_extents = 127;
3783
3784 err:
3785 mutex_unlock(&mdev->md_io_mutex);
3786 put_ldev(mdev);
3787
3788 return rv;
3789}
3790
3791/**
3792 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3793 * @mdev: DRBD device.
3794 *
3795 * Call this function if you change anything that should be written to
3796 * the meta-data super block. This function sets MD_DIRTY, and starts a
3797 * timer that ensures that within five seconds you have to call drbd_md_sync().
3798 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003799#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003800void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3801{
3802 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3803 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3804 mdev->last_md_mark_dirty.line = line;
3805 mdev->last_md_mark_dirty.func = func;
3806 }
3807}
3808#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003809void drbd_md_mark_dirty(struct drbd_conf *mdev)
3810{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003811 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003812 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003813}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003814#endif
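/*
 * Editor's sketch of the write-back pattern above: metadata mutators mark
 * the in-core superblock dirty and let the md_sync_timer batch the actual
 * write (5s, 1s with DEBUG); paths that must not lose the update call
 * drbd_md_sync() directly, as drbd_uuid_new_current() does below.
 * Hypothetical caller, for illustration only.
 */
static inline void example_md_update(struct drbd_conf *mdev, int flag)
{
	mdev->ldev->md.flags |= flag;	/* mutate in-core metadata              */
	drbd_md_mark_dirty(mdev);	/* set MD_DIRTY, arm the sync timer     */
	drbd_md_sync(mdev);		/* optional: write now, clears MD_DIRTY */
}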
Philipp Reisnerb411b362009-09-25 16:07:19 -07003815
3816static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3817{
3818 int i;
3819
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003820 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003821 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003822}
3823
3824void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3825{
3826 if (idx == UI_CURRENT) {
3827 if (mdev->state.role == R_PRIMARY)
3828 val |= 1;
3829 else
3830 val &= ~((u64)1);
3831
3832 drbd_set_ed_uuid(mdev, val);
3833 }
3834
3835 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003836 drbd_md_mark_dirty(mdev);
3837}
3838
3839
3840void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3841{
3842 if (mdev->ldev->md.uuid[idx]) {
3843 drbd_uuid_move_history(mdev);
3844 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003845 }
3846 _drbd_uuid_set(mdev, idx, val);
3847}
3848
3849/**
3850 * drbd_uuid_new_current() - Creates a new current UUID
3851 * @mdev: DRBD device.
3852 *
3853 * Creates a new current UUID, and rotates the old current UUID into
3854 * the bitmap slot. Causes an incremental resync upon next connect.
3855 */
3856void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3857{
3858 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003859 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003860
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003861 if (bm_uuid)
3862 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3863
Philipp Reisnerb411b362009-09-25 16:07:19 -07003864 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003865
3866 get_random_bytes(&val, sizeof(u64));
3867 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003868 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003869 /* get it to stable storage _now_ */
3870 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003871}
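/*
 * Editor's worked example of the rotation above (values made up, not
 * driver code): what ends up in which UUID slot.
 */
static inline void example_uuid_rotation(struct drbd_conf *mdev)
{
	/* suppose UI_CURRENT == 0xAAAA and UI_BITMAP == 0 on entry */
	drbd_uuid_new_current(mdev);
	/* now UI_BITMAP  == 0xAAAA (the generation the peer last knew),
	 *     UI_CURRENT == a fresh random value for the new generation,
	 * and the next connect can resync incrementally from the bitmap. */
}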
3872
3873void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3874{
3875 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3876 return;
3877
3878 if (val == 0) {
3879 drbd_uuid_move_history(mdev);
3880 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3881 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003882 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003883 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3884 if (bm_uuid)
3885 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003886
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003887 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003888 }
3889 drbd_md_mark_dirty(mdev);
3890}
3891
3892/**
3893 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3894 * @mdev: DRBD device.
3895 *
3896 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3897 */
3898int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3899{
3900 int rv = -EIO;
3901
3902 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3903 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3904 drbd_md_sync(mdev);
3905 drbd_bm_set_all(mdev);
3906
3907 rv = drbd_bm_write(mdev);
3908
3909 if (!rv) {
3910 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3911 drbd_md_sync(mdev);
3912 }
3913
3914 put_ldev(mdev);
3915 }
3916
3917 return rv;
3918}
3919
3920/**
3921 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3922 * @mdev: DRBD device.
3923 *
3924 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3925 */
3926int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3927{
3928 int rv = -EIO;
3929
Philipp Reisner07782862010-08-31 12:00:50 +02003930 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003931 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3932 drbd_bm_clear_all(mdev);
3933 rv = drbd_bm_write(mdev);
3934 put_ldev(mdev);
3935 }
3936
3937 return rv;
3938}
3939
3940static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3941{
3942 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003943 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003944
3945 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3946
Lars Ellenberg02851e92010-12-16 14:47:39 +01003947 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003948 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003949 rv = work->io_fn(mdev);
3950 drbd_bm_unlock(mdev);
3951 put_ldev(mdev);
3952 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003953
3954 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003955 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07003956 wake_up(&mdev->misc_wait);
3957
3958 if (work->done)
3959 work->done(mdev, rv);
3960
3961 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3962 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003963 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003964
3965 return 1;
3966}
3967
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003968void drbd_ldev_destroy(struct drbd_conf *mdev)
3969{
3970 lc_destroy(mdev->resync);
3971 mdev->resync = NULL;
3972 lc_destroy(mdev->act_log);
3973 mdev->act_log = NULL;
3974 __no_warn(local,
3975 drbd_free_bc(mdev->ldev);
3976 mdev->ldev = NULL;);
3977
3978 if (mdev->md_io_tmpp) {
3979 __free_page(mdev->md_io_tmpp);
3980 mdev->md_io_tmpp = NULL;
3981 }
3982 clear_bit(GO_DISKLESS, &mdev->flags);
3983}
3984
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003985static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3986{
3987 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02003988 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3989 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003990 * the protected members anymore, though, so once put_ldev reaches zero
3991 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003992 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003993 return 1;
3994}
3995
3996void drbd_go_diskless(struct drbd_conf *mdev)
3997{
3998 D_ASSERT(mdev->state.disk == D_FAILED);
3999 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Philipp Reisnere42325a2011-01-19 13:55:45 +01004000 drbd_queue_work(&mdev->tconn->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02004001}
4002
Philipp Reisnerb411b362009-09-25 16:07:19 -07004003/**
4004 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4005 * @mdev: DRBD device.
4006 * @io_fn: IO callback to be called when bitmap IO is possible
4007 * @done: callback to be called after the bitmap IO was performed
4008 * @why: Descriptive text of the reason for doing the IO
4009 *
4010 * While IO on the bitmap happens we freeze application IO, thus ensuring
4011 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4012 * called from worker context. It MUST NOT be used while a previous such
4013 * work is still pending!
4014 */
4015void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4016 int (*io_fn)(struct drbd_conf *),
4017 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004018 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004019{
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01004020 D_ASSERT(current == mdev->tconn->worker.task);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004021
4022 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4023 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4024 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4025 if (mdev->bm_io_work.why)
4026 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4027 why, mdev->bm_io_work.why);
4028
4029 mdev->bm_io_work.io_fn = io_fn;
4030 mdev->bm_io_work.done = done;
4031 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004032 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004033
Philipp Reisner87eeee42011-01-19 14:16:30 +01004034 spin_lock_irq(&mdev->tconn->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004035 set_bit(BITMAP_IO, &mdev->flags);
4036 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004037 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnere42325a2011-01-19 13:55:45 +01004038 drbd_queue_work(&mdev->tconn->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004039 }
Philipp Reisner87eeee42011-01-19 14:16:30 +01004040 spin_unlock_irq(&mdev->tconn->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004041}
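/*
 * Editor's usage sketch for the helper above: queueing a "set all bits
 * and write out" pass from worker context.  The io_fn is the real
 * drbd_bmio_set_n_write() defined earlier in this file; the done()
 * callback, the reason string and the flag choice are assumptions made
 * for the example (BM_LOCKED_SET_ALLOWED, as tested in drbd_bitmap_io()
 * below, since this io_fn sets bits while the bitmap is locked).
 */
static void example_bm_io_done(struct drbd_conf *mdev, int rv)
{
	if (rv)
		dev_err(DEV, "example: writing out the full bitmap failed\n");
}

static void example_queue_full_bitmap_write(struct drbd_conf *mdev)
{
	/* must be called from the worker, see the D_ASSERT above */
	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, example_bm_io_done,
			     "example: set_n_write", BM_LOCKED_SET_ALLOWED);
}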
4042
4043/**
4044 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4045 * @mdev: DRBD device.
4046 * @io_fn: IO callback to be called when bitmap IO is possible
4047 * @why: Descriptive text of the reason for doing the IO
4048 *
4049 * freezes application IO while the actual IO operation runs. This
4050 * function MAY NOT be called from worker context.
4051 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004052int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4053 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004054{
4055 int rv;
4056
Philipp Reisnere6b3ea82011-01-19 14:02:01 +01004057 D_ASSERT(current != mdev->tconn->worker.task);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004058
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004059 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4060 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004061
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004062 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004063 rv = io_fn(mdev);
4064 drbd_bm_unlock(mdev);
4065
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004066 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4067 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004068
4069 return rv;
4070}
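
/*
 * Illustrative sketch only, not part of the driver: a caller outside the
 * worker (e.g. a configuration/netlink handler) might run a synchronous
 * bitmap IO roughly like this.  drbd_bm_write() is assumed to be the
 * whole-bitmap writeout helper; per the check above, passing
 * BM_LOCKED_SET_ALLOWED means application IO is not additionally suspended
 * around the call.
 *
 *	int rv = drbd_bitmap_io(mdev, &drbd_bm_write,
 *				"example bitmap writeout", BM_LOCKED_SET_ALLOWED);
 *	if (rv)
 *		dev_err(DEV, "bitmap IO failed: %d\n", rv);
 */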

void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != flag) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags |= flag;
	}
}

void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != 0) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags &= ~flag;
	}
}

int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
{
	return (bdev->md.flags & flag) != 0;
}
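
/*
 * Illustrative sketch only, not part of the driver: the three helpers above
 * give a "mark meta-data dirty only on actual change" pattern.  MDF_EXAMPLE
 * is a hypothetical flag bit; real callers use the MDF_* constants of the
 * on-disk meta-data layout and, per __must_hold(local), hold a local disk
 * reference (get_ldev()/put_ldev()) around the calls.
 *
 *	#define MDF_EXAMPLE (1 << 15)
 *
 *	if (get_ldev(mdev)) {
 *		if (!drbd_md_test_flag(mdev->ldev, MDF_EXAMPLE))
 *			drbd_md_set_flag(mdev, MDF_EXAMPLE);
 *		...
 *		drbd_md_clear_flag(mdev, MDF_EXAMPLE);
 *		put_ldev(mdev);
 *	}
 */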

static void md_sync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work_front(&mdev->tconn->data.work, &mdev->md_sync_work);
}

static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
		 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
	drbd_md_sync(mdev);
	return 1;
}

const char *cmdname(enum drbd_packet cmd)
{
	/* THINK may need to become several global tables
	 * when we want to support more than
	 * one PRO_VERSION */
	static const char *cmdnames[] = {
		[P_DATA]               = "Data",
		[P_DATA_REPLY]         = "DataReply",
		[P_RS_DATA_REPLY]      = "RSDataReply",
		[P_BARRIER]            = "Barrier",
		[P_BITMAP]             = "ReportBitMap",
		[P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
		[P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
		[P_UNPLUG_REMOTE]      = "UnplugRemote",
		[P_DATA_REQUEST]       = "DataRequest",
		[P_RS_DATA_REQUEST]    = "RSDataRequest",
		[P_SYNC_PARAM]         = "SyncParam",
		[P_SYNC_PARAM89]       = "SyncParam89",
		[P_PROTOCOL]           = "ReportProtocol",
		[P_UUIDS]              = "ReportUUIDs",
		[P_SIZES]              = "ReportSizes",
		[P_STATE]              = "ReportState",
		[P_SYNC_UUID]          = "ReportSyncUUID",
		[P_AUTH_CHALLENGE]     = "AuthChallenge",
		[P_AUTH_RESPONSE]      = "AuthResponse",
		[P_PING]               = "Ping",
		[P_PING_ACK]           = "PingAck",
		[P_RECV_ACK]           = "RecvAck",
		[P_WRITE_ACK]          = "WriteAck",
		[P_RS_WRITE_ACK]       = "RSWriteAck",
		[P_DISCARD_ACK]        = "DiscardAck",
		[P_NEG_ACK]            = "NegAck",
		[P_NEG_DREPLY]         = "NegDReply",
		[P_NEG_RS_DREPLY]      = "NegRSDReply",
		[P_BARRIER_ACK]        = "BarrierAck",
		[P_STATE_CHG_REQ]      = "StateChgRequest",
		[P_STATE_CHG_REPLY]    = "StateChgReply",
		[P_OV_REQUEST]         = "OVRequest",
		[P_OV_REPLY]           = "OVReply",
		[P_OV_RESULT]          = "OVResult",
		[P_CSUM_RS_REQUEST]    = "CsumRSRequest",
		[P_RS_IS_IN_SYNC]      = "CsumRSIsInSync",
		[P_COMPRESSED_BITMAP]  = "CBitmap",
		[P_DELAY_PROBE]        = "DelayProbe",
		[P_OUT_OF_SYNC]        = "OutOfSync",
		[P_MAX_CMD]            = NULL,
	};

	if (cmd == P_HAND_SHAKE_M)
		return "HandShakeM";
	if (cmd == P_HAND_SHAKE_S)
		return "HandShakeS";
	if (cmd == P_HAND_SHAKE)
		return "HandShake";
	if (cmd >= P_MAX_CMD)
		return "Unknown";
	return cmdnames[cmd];
}
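
/*
 * Illustrative sketch only, not part of the driver: cmdname() is handy for
 * diagnostics in the receive/ack paths, e.g. (surrounding context assumed):
 *
 *	dev_err(DEV, "unexpected packet type %s\n", cmdname(cmd));
 */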

#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;
	unsigned long count;
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	long refresh;

	if (!rsp->count--) {
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}
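
/*
 * In other words, apart from the occasional reseed via get_random_bytes(),
 * the generator above is the plain linear congruential recurrence
 *
 *	state[n+1] = state[n] * 39916801 + 479001701   (mod 2^BITS_PER_LONG)
 *
 * and swahw32() swaps the two 16-bit halfwords of the low 32 bits,
 * presumably so the statistically better high-order LCG bits land in the
 * positions that _drbd_insert_fault() below reduces with "% 100".  A minimal
 * user-space sketch of one step, assuming only the low 32 bits matter
 * (the last line mimics swahw32() on a 32-bit value):
 *
 *	unsigned long s = 12345;
 *	s = s * 39916801UL + 479001701UL;
 *	unsigned int lo = (unsigned int)s;
 *	unsigned int r  = (lo << 16) | (lo >> 16);
 */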

static char *
_drbd_fault_str(unsigned int type) {
	static char *_faults[] = {
		[DRBD_FAULT_MD_WR] = "Meta-data write",
		[DRBD_FAULT_MD_RD] = "Meta-data read",
		[DRBD_FAULT_RS_WR] = "Resync write",
		[DRBD_FAULT_RS_RD] = "Resync read",
		[DRBD_FAULT_DT_WR] = "Data write",
		[DRBD_FAULT_DT_RD] = "Data read",
		[DRBD_FAULT_DT_RA] = "Data read ahead",
		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
		[DRBD_FAULT_AL_EE] = "EE allocation",
		[DRBD_FAULT_RECEIVE] = "receive data corruption",
	};

	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
}

unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
{
	static struct fault_random_state rrs = {0, 0};

	unsigned int ret = (
		(fault_devs == 0 ||
			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));

	if (ret) {
		fault_count++;

		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "***Simulating %s failure\n",
				_drbd_fault_str(type));
	}

	return ret;
}
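
/*
 * Illustrative sketch only, not part of the driver: a submission path might
 * consult the fault injector before issuing a meta-data write roughly like
 * this.  DRBD_FAULT_MD_WR comes from the table above; fault_rate is the same
 * module parameter _drbd_insert_fault() checks; the bio handling is
 * schematic, with the first branch completing the bio with -EIO instead of
 * submitting it.
 *
 *	if (fault_rate && _drbd_insert_fault(mdev, DRBD_FAULT_MD_WR))
 *		bio_endio(bio, -EIO);
 *	else
 *		submit_bio(WRITE, bio);
 */
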
#endif

const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has a reference to the
	   git hash of the source code here. */

	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef CONFIG_MODULES
		if (THIS_MODULE != NULL)
			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
		else
#endif
			buildtag[0] = 'b';
	}

	return buildtag;
}

module_init(drbd_init)
module_exit(drbd_cleanup)

EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);