/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <asm/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/kthread.h>

#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>

#include <linux/drbd_limits.h>
#include "drbd_int.h"
#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */

#include "drbd_vli.h"

struct after_state_chg_work {
	struct drbd_work w;
	union drbd_state os;
	union drbd_state ns;
	enum chg_state_flags flags;
	struct completion *done;
};

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;       /* Detail level in proc drbd*/

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t   drbd_pp_lock;
int          drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner =   THIS_MODULE,
	.open =    drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
 * attached.
 */
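/*
 * Illustration of the structure described above, with three epochs in
 * flight; the arrows follow the ->next pointers of &struct drbd_tl_epoch:
 *
 *	oldest_tle -> [epoch n] -> [epoch n+1] -> [epoch n+2] <- newest_tle
 *			  |             |               |
 *		      requests      requests        requests
 */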
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
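	/* The "?: 1" below uses gcc's conditional with omitted middle operand:
	 * it yields the incremented number unless that wrapped around to 0,
	 * in which case 1 is used instead. */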
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		       unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle points already to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}


/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
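		/* if an epoch is dropped below but still holds read requests,
		 * those are collected on carry_reads and spliced into the
		 * following epoch */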
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}
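/* A cluster wide state change is negotiated with the peer first (see the
 * drbd_send_state_req() usage in drbd_req_state() below) before it is
 * committed locally; purely local changes skip that round trip. */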

enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}
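/* _req_st_cond() is used as the wait_event() condition in drbd_req_state():
 * SS_UNKNOWN_ERROR means "no answer from the peer yet" and keeps the waiter
 * sleeping; any other value (success, failure, or "no cluster wide change
 * needed") wakes it up. */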

/**
 * drbd_req_state() - Perform an eventually cluster wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
		  union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort:
 *
 * When we lose connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Do not let network errors reconfigure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn_sync_abort)
			*warn_sync_abort =
				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				"Online-verify" : "Resync";
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection stat on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	const char *warn_sync_abort = NULL;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/*  pre-state-change checks ; only look at ns  */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	if (warn_sync_abort)
		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);

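	/* Build a one line, human readable summary of the fields that actually
	 * changed, e.g. "conn( Connected -> Disconnecting ) ", and log it. */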
	{
		char *pbp, pb[300];
		pbp = pb;
		*pbp = 0;
		if (ns.role != os.role)
			pbp += sprintf(pbp, "role( %s -> %s ) ",
				       drbd_role_str(os.role),
				       drbd_role_str(ns.role));
		if (ns.peer != os.peer)
			pbp += sprintf(pbp, "peer( %s -> %s ) ",
				       drbd_role_str(os.peer),
				       drbd_role_str(ns.peer));
		if (ns.conn != os.conn)
			pbp += sprintf(pbp, "conn( %s -> %s ) ",
				       drbd_conn_str(os.conn),
				       drbd_conn_str(ns.conn));
		if (ns.disk != os.disk)
			pbp += sprintf(pbp, "disk( %s -> %s ) ",
				       drbd_disk_str(os.disk),
				       drbd_disk_str(ns.disk));
		if (ns.pdsk != os.pdsk)
			pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
				       drbd_disk_str(os.pdsk),
				       drbd_disk_str(ns.pdsk));
		if (is_susp(ns) != is_susp(os))
			pbp += sprintf(pbp, "susp( %d -> %d ) ",
				       is_susp(os),
				       is_susp(ns));
		if (ns.aftr_isp != os.aftr_isp)
			pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
				       os.aftr_isp,
				       ns.aftr_isp);
		if (ns.peer_isp != os.peer_isp)
			pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
				       os.peer_isp,
				       ns.peer_isp);
		if (ns.user_isp != os.user_isp)
			pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
				       os.user_isp,
				       ns.user_isp);
		dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

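	/* after_state_ch() may sleep, but we are holding req_lock here, so the
	 * follow-up work is handed off to the worker thread. */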
	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
		int (*io_fn)(struct drbd_conf *),
		char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current == mdev->worker.task);

	/* open coded non-blocking drbd_suspend_io(mdev); */
	set_bit(SUSPEND_IO, &mdev->flags);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
			what = resend;

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = restart_frozen_disk_io;

		if (what != nothing)
			nsm.susp_nod = 0;
	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}

	/* Became sync source.  With protocol >= 96, we still need to send out
	 * the sync uuid now. Need to do that before any drbd_send_state, or
	 * the other side may go "paused sync" before receiving the sync uuids,
	 * which is unexpected. */
	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
		drbd_gen_and_send_sync_uuid(mdev);
		put_ldev(mdev);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	/* No point in queuing send_bitmap if we don't have a connection
	 * anymore, so check also the _current_ state, not only the new state
	 * at the time this work was queued. */
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
	    mdev->state.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
				"send_bitmap (WFBitMapS)",
				BM_LOCKED_TEST_ALLOWED);

	/* Lost contact to peer's copy of the data */
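	/* If either node is Primary, losing sight of the peer's data starts a
	 * new data generation (current UUID) below, so the divergence shows up
	 * in the next UUID handshake.  While IO is suspended this is only
	 * noted via NEW_CUR_UUID and performed once the suspension is resolved
	 * (see the susp_fen handling above). */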
1420 if ((os.pdsk >= D_INCONSISTENT &&
1421 os.pdsk != D_UNKNOWN &&
1422 os.pdsk != D_OUTDATED)
1423 && (ns.pdsk < D_INCONSISTENT ||
1424 ns.pdsk == D_UNKNOWN ||
1425 ns.pdsk == D_OUTDATED)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001426 if (get_ldev(mdev)) {
1427 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001428 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001429 if (is_susp(mdev->state)) {
Philipp Reisner43a51822010-06-11 11:26:34 +02001430 set_bit(NEW_CUR_UUID, &mdev->flags);
1431 } else {
1432 drbd_uuid_new_current(mdev);
1433 drbd_send_uuids(mdev);
1434 }
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001435 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001436 put_ldev(mdev);
1437 }
1438 }
1439
1440 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001441 if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
Philipp Reisner2c8d1962010-05-25 14:32:03 +02001442 drbd_uuid_new_current(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02001443 drbd_send_uuids(mdev);
1444 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001445
1446 /* D_DISKLESS Peer becomes secondary */
1447 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001448 /* We may still be Primary ourselves.
1449 * No harm done if the bitmap still changes,
1450 * redirtied pages will follow later. */
1451 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1452 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg19f843a2010-12-15 08:59:11 +01001453 put_ldev(mdev);
1454 }
1455
Lars Ellenberg06d33e92010-12-18 17:00:59 +01001456 /* Write out all changed bits on demote.
1457 * Though, no need to da that just yet
1458 * if there is a resync going on still */
1459 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1460 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001461 /* No changes to the bitmap expected this time, so assert that,
1462 * even though no harm was done if it did change. */
1463 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1464 "demote", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001465 put_ldev(mdev);
1466 }
1467
1468 /* Last part of the attaching process ... */
1469 if (ns.conn >= C_CONNECTED &&
1470 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01001471 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001472 drbd_send_uuids(mdev);
1473 drbd_send_state(mdev);
1474 }
1475
1476 /* We want to pause/continue resync, tell peer. */
1477 if (ns.conn >= C_CONNECTED &&
1478 ((os.aftr_isp != ns.aftr_isp) ||
1479 (os.user_isp != ns.user_isp)))
1480 drbd_send_state(mdev);
1481
1482 /* In case one of the isp bits got set, suspend other devices. */
1483 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1484 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1485 suspend_other_sg(mdev);
1486
1487 /* Make sure the peer gets informed about possible state
1488 changes (ISP bits) while we were in WFReportParams. */
1489 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1490 drbd_send_state(mdev);
1491
Philipp Reisner67531712010-10-27 12:21:30 +02001492 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1493 drbd_send_state(mdev);
1494
Philipp Reisnerb411b362009-09-25 16:07:19 -07001495 /* We are in the process of starting a full sync... */
1496 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1497 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001498 /* no other bitmap changes expected during this phase */
1499 drbd_queue_bitmap_io(mdev,
1500 &drbd_bmio_set_n_write, &abw_start_sync,
1501 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001502
1503 /* We are invalidating ourselves... */
1504 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1505 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001506 /* other bitmap operations are expected during this phase */
1507 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1508 "set_n_write from invalidate", BM_LOCKED_MASK);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001509
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001510 /* first half of local IO error, failure to attach,
1511 * or administrative detach */
1512 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1513 enum drbd_io_error_p eh;
1514 int was_io_error;
1515 /* corresponding get_ldev was in __drbd_set_state, to serialize
1516 * our cleanup here with the transition to D_DISKLESS,
1517 * so it is safe to dereference ldev here. */
1518 eh = mdev->ldev->dc.on_io_error;
1519 was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1520
1521 /* current state still has to be D_FAILED,
1522 * there is only one way out: to D_DISKLESS,
1523 * and that may only happen after our put_ldev below. */
1524 if (mdev->state.disk != D_FAILED)
1525 dev_err(DEV,
1526 "ASSERT FAILED: disk is %s during detach\n",
1527 drbd_disk_str(mdev->state.disk));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001528
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001529 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001530 dev_warn(DEV, "Notified peer that I am detaching my disk\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001531 else
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001532 dev_err(DEV, "Sending state for detaching disk failed\n");
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001533
1534 drbd_rs_cancel_all(mdev);
1535
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001536 /* In case we want to get something to stable storage still,
1537 * this may be the last chance.
1538 * Following put_ldev may transition to D_DISKLESS. */
1539 drbd_md_sync(mdev);
1540 put_ldev(mdev);
1541
1542 if (was_io_error && eh == EP_CALL_HELPER)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001543 drbd_khelper(mdev, "local-io-error");
1544 }
1545
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001546 /* second half of local IO error, failure to attach,
1547 * or administrative detach,
1548 * after local_cnt references have reached zero again */
1549 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1550 /* We must still be diskless,
1551 * re-attach has to be serialized with this! */
1552 if (mdev->state.disk != D_DISKLESS)
1553 dev_err(DEV,
1554 "ASSERT FAILED: disk is %s while going diskless\n",
1555 drbd_disk_str(mdev->state.disk));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001556
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001557 mdev->rs_total = 0;
1558 mdev->rs_failed = 0;
1559 atomic_set(&mdev->rs_pending_cnt, 0);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001560
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02001561 if (drbd_send_state(mdev))
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001562 dev_warn(DEV, "Notified peer that I'm now diskless.\n");
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001563 /* corresponding get_ldev in __drbd_set_state
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001564 * this may finally trigger drbd_ldev_destroy. */
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02001565 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001566 }
1567
Philipp Reisner738a84b2011-03-03 00:21:30 +01001568 /* Notify peer that I had a local IO error and did not detach. */
1569 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1570 drbd_send_state(mdev);
1571
Philipp Reisnerb411b362009-09-25 16:07:19 -07001572 /* Disks got bigger while they were detached */
1573 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1574 test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1575 if (ns.conn == C_CONNECTED)
1576 resync_after_online_grow(mdev);
1577 }
1578
1579 /* A resync finished or aborted, wake paused devices... */
1580 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1581 (os.peer_isp && !ns.peer_isp) ||
1582 (os.user_isp && !ns.user_isp))
1583 resume_next_sg(mdev);
1584
Lars Ellenbergaf85e8e2010-10-07 16:07:55 +02001585 /* sync target done with resync. Explicitly notify peer, even though
1586 * it should (at least for non-empty resyncs) already know this itself. */
1587 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1588 drbd_send_state(mdev);
1589
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001590 /* This triggers bitmap writeout of potentially still unwritten pages
1591 * if the resync finished cleanly, or aborted because of peer disk
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001592 * failure, or because of connection loss.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001593 * For resync aborted because of local disk failure, we cannot do
1594 * any bitmap writeout anymore.
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001595 * No harm done if some bits change during this phase.
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001596 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01001597 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1598 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1599 "write from resync_finished", BM_LOCKED_SET_ALLOWED);
Lars Ellenberg79a30d22011-01-20 10:32:05 +01001600 put_ldev(mdev);
1601 }
Lars Ellenberg02851e92010-12-16 14:47:39 +01001602
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001603 /* free tl_hash if we got thawed and are C_STANDALONE */
Philipp Reisnerfb22c402010-09-08 23:20:21 +02001604 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
Philipp Reisnerf70b35112010-06-24 14:34:40 +02001605 drbd_free_tl_hash(mdev);
1606
Philipp Reisnerb411b362009-09-25 16:07:19 -07001607 /* Upon network connection, we need to start the receiver */
1608 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1609 drbd_thread_start(&mdev->receiver);
1610
1611 /* Terminate worker thread if we are unconfigured - it will be
1612 restarted as needed... */
1613 if (ns.disk == D_DISKLESS &&
1614 ns.conn == C_STANDALONE &&
1615 ns.role == R_SECONDARY) {
1616 if (os.aftr_isp != ns.aftr_isp)
1617 resume_next_sg(mdev);
1618 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1619 if (test_bit(DEVICE_DYING, &mdev->flags))
1620 drbd_thread_stop_nowait(&mdev->worker);
1621 }
1622
1623 drbd_md_sync(mdev);
1624}
1625
1626
1627static int drbd_thread_setup(void *arg)
1628{
1629 struct drbd_thread *thi = (struct drbd_thread *) arg;
1630 struct drbd_conf *mdev = thi->mdev;
1631 unsigned long flags;
1632 int retval;
1633
1634restart:
1635 retval = thi->function(thi);
1636
1637 spin_lock_irqsave(&thi->t_lock, flags);
1638
1639 /* if the receiver has been "Exiting", the last thing it did
1640 * was set the conn state to "StandAlone",
1641 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1642 * and receiver thread will be "started".
1643 * drbd_thread_start needs to set "Restarting" in that case.
1644 * t_state check and assignment needs to be within the same spinlock,
1645 * so either thread_start sees Exiting, and can remap to Restarting,
1646 * or thread_start sees None, and can proceed as normal.
1647 */
1648
1649 if (thi->t_state == Restarting) {
1650 dev_info(DEV, "Restarting %s\n", current->comm);
1651 thi->t_state = Running;
1652 spin_unlock_irqrestore(&thi->t_lock, flags);
1653 goto restart;
1654 }
1655
1656 thi->task = NULL;
1657 thi->t_state = None;
1658 smp_mb();
1659 complete(&thi->stop);
1660 spin_unlock_irqrestore(&thi->t_lock, flags);
1661
1662 dev_info(DEV, "Terminating %s\n", current->comm);
1663
1664 /* Release mod reference taken when thread was started */
1665 module_put(THIS_MODULE);
1666 return retval;
1667}
1668
1669static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1670 int (*func) (struct drbd_thread *))
1671{
1672 spin_lock_init(&thi->t_lock);
1673 thi->task = NULL;
1674 thi->t_state = None;
1675 thi->function = func;
1676 thi->mdev = mdev;
1677}
1678
1679int drbd_thread_start(struct drbd_thread *thi)
1680{
1681 struct drbd_conf *mdev = thi->mdev;
1682 struct task_struct *nt;
1683 unsigned long flags;
1684
1685 const char *me =
1686 thi == &mdev->receiver ? "receiver" :
1687 thi == &mdev->asender ? "asender" :
1688 thi == &mdev->worker ? "worker" : "NONSENSE";
1689
1690 /* is used from state engine doing drbd_thread_stop_nowait,
1691 * while holding the req lock irqsave */
1692 spin_lock_irqsave(&thi->t_lock, flags);
1693
1694 switch (thi->t_state) {
1695 case None:
1696 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1697 me, current->comm, current->pid);
1698
1699 /* Get ref on module for thread - this is released when thread exits */
1700 if (!try_module_get(THIS_MODULE)) {
1701 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1702 spin_unlock_irqrestore(&thi->t_lock, flags);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001703 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001704 }
1705
1706 init_completion(&thi->stop);
1707 D_ASSERT(thi->task == NULL);
1708 thi->reset_cpu_mask = 1;
1709 thi->t_state = Running;
1710 spin_unlock_irqrestore(&thi->t_lock, flags);
1711 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1712
1713 nt = kthread_create(drbd_thread_setup, (void *) thi,
1714 "drbd%d_%s", mdev_to_minor(mdev), me);
1715
1716 if (IS_ERR(nt)) {
1717 dev_err(DEV, "Couldn't start thread\n");
1718
1719 module_put(THIS_MODULE);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001720 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001721 }
1722 spin_lock_irqsave(&thi->t_lock, flags);
1723 thi->task = nt;
1724 thi->t_state = Running;
1725 spin_unlock_irqrestore(&thi->t_lock, flags);
1726 wake_up_process(nt);
1727 break;
1728 case Exiting:
1729 thi->t_state = Restarting;
1730 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1731 me, current->comm, current->pid);
1732 /* fall through */
1733 case Running:
1734 case Restarting:
1735 default:
1736 spin_unlock_irqrestore(&thi->t_lock, flags);
1737 break;
1738 }
1739
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001740 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001741}
1742
1743
1744void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1745{
1746 unsigned long flags;
1747
1748 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1749
1750 /* may be called from state engine, holding the req lock irqsave */
1751 spin_lock_irqsave(&thi->t_lock, flags);
1752
1753 if (thi->t_state == None) {
1754 spin_unlock_irqrestore(&thi->t_lock, flags);
1755 if (restart)
1756 drbd_thread_start(thi);
1757 return;
1758 }
1759
1760 if (thi->t_state != ns) {
1761 if (thi->task == NULL) {
1762 spin_unlock_irqrestore(&thi->t_lock, flags);
1763 return;
1764 }
1765
1766 thi->t_state = ns;
1767 smp_mb();
1768 init_completion(&thi->stop);
1769 if (thi->task != current)
1770 force_sig(DRBD_SIGKILL, thi->task);
1771
1772 }
1773
1774 spin_unlock_irqrestore(&thi->t_lock, flags);
1775
1776 if (wait)
1777 wait_for_completion(&thi->stop);
1778}
1779
1780#ifdef CONFIG_SMP
1781/**
1782 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1783 * @mdev: DRBD device.
1784 *
1785 * Forces all threads of a device onto the same CPU. This is beneficial for
1786 * DRBD's performance. May be overridden by the user's configuration.
1787 */
1788void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1789{
1790 int ord, cpu;
1791
1792 /* user override. */
1793 if (cpumask_weight(mdev->cpu_mask))
1794 return;
1795
1796 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1797 for_each_online_cpu(cpu) {
1798 if (ord-- == 0) {
1799 cpumask_set_cpu(cpu, mdev->cpu_mask);
1800 return;
1801 }
1802 }
1803 /* should not be reached */
1804 cpumask_setall(mdev->cpu_mask);
1805}
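/* Illustrative example of the mapping above: with four online CPUs and
 * device minors 0..5, the per-device threads would end up pinned to
 * CPUs 0, 1, 2, 3, 0 and 1 respectively (assuming the online CPUs are
 * simply cpu0..cpu3), unless the user configured an explicit cpu_mask. */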
1806
1807/**
1808 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1809 * @mdev: DRBD device.
1810 *
1811 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1812 * prematurely.
1813 */
1814void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1815{
1816 struct task_struct *p = current;
1817 struct drbd_thread *thi =
1818 p == mdev->asender.task ? &mdev->asender :
1819 p == mdev->receiver.task ? &mdev->receiver :
1820 p == mdev->worker.task ? &mdev->worker :
1821 NULL;
1822 ERR_IF(thi == NULL)
1823 return;
1824 if (!thi->reset_cpu_mask)
1825 return;
1826 thi->reset_cpu_mask = 0;
1827 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1828}
1829#endif
1830
1831/* the appropriate socket mutex must be held already */
1832int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001833 enum drbd_packets cmd, struct p_header80 *h,
Philipp Reisnerb411b362009-09-25 16:07:19 -07001834 size_t size, unsigned msg_flags)
1835{
1836 int sent, ok;
1837
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001838 ERR_IF(!h) return false;
1839 ERR_IF(!size) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001840
1841 h->magic = BE_DRBD_MAGIC;
1842 h->command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02001843 h->length = cpu_to_be16(size-sizeof(struct p_header80));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001844
Philipp Reisnerb411b362009-09-25 16:07:19 -07001845 sent = drbd_send(mdev, sock, h, size, msg_flags);
1846
1847 ok = (sent == size);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001848 if (!ok && !signal_pending(current))
1849 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001850 cmdname(cmd), (int)size, sent);
1851 return ok;
1852}
1853
1854/* don't pass the socket. we may only look at it
1855 * when we hold the appropriate socket mutex.
1856 */
1857int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001858 enum drbd_packets cmd, struct p_header80 *h, size_t size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001859{
1860 int ok = 0;
1861 struct socket *sock;
1862
1863 if (use_data_socket) {
1864 mutex_lock(&mdev->data.mutex);
1865 sock = mdev->data.socket;
1866 } else {
1867 mutex_lock(&mdev->meta.mutex);
1868 sock = mdev->meta.socket;
1869 }
1870
1871 /* drbd_disconnect() could have called drbd_free_sock()
1872 * while we were waiting in down()... */
1873 if (likely(sock != NULL))
1874 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1875
1876 if (use_data_socket)
1877 mutex_unlock(&mdev->data.mutex);
1878 else
1879 mutex_unlock(&mdev->meta.mutex);
1880 return ok;
1881}
1882
1883int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1884 size_t size)
1885{
Philipp Reisner0b70a132010-08-20 13:36:10 +02001886 struct p_header80 h;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001887 int ok;
1888
1889 h.magic = BE_DRBD_MAGIC;
1890 h.command = cpu_to_be16(cmd);
1891 h.length = cpu_to_be16(size);
1892
1893 if (!drbd_get_data_sock(mdev))
1894 return 0;
1895
Philipp Reisnerb411b362009-09-25 16:07:19 -07001896 ok = (sizeof(h) ==
1897 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1898 ok = ok && (size ==
1899 drbd_send(mdev, mdev->data.socket, data, size, 0));
1900
1901 drbd_put_data_sock(mdev);
1902
1903 return ok;
1904}
1905
1906int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1907{
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001908 struct p_rs_param_95 *p;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001909 struct socket *sock;
1910 int size, rv;
1911 const int apv = mdev->agreed_pro_version;
1912
1913 size = apv <= 87 ? sizeof(struct p_rs_param)
1914 : apv == 88 ? sizeof(struct p_rs_param)
1915 + strlen(mdev->sync_conf.verify_alg) + 1
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001916 : apv <= 94 ? sizeof(struct p_rs_param_89)
1917 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001918
1919 /* used from admin command context and receiver/worker context.
1920 * to avoid kmalloc, grab the socket right here,
1921 * then use the pre-allocated sbuf there */
1922 mutex_lock(&mdev->data.mutex);
1923 sock = mdev->data.socket;
1924
1925 if (likely(sock != NULL)) {
1926 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1927
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001928 p = &mdev->data.sbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001929
1930 /* initialize verify_alg and csums_alg */
1931 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1932
1933 p->rate = cpu_to_be32(sc->rate);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02001934 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1935 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1936 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1937 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001938
1939 if (apv >= 88)
1940 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1941 if (apv >= 89)
1942 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1943
1944 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1945 } else
1946 rv = 0; /* not ok */
1947
1948 mutex_unlock(&mdev->data.mutex);
1949
1950 return rv;
1951}
1952
1953int drbd_send_protocol(struct drbd_conf *mdev)
1954{
1955 struct p_protocol *p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001956 int size, cf, rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001957
1958 size = sizeof(struct p_protocol);
1959
1960 if (mdev->agreed_pro_version >= 87)
1961 size += strlen(mdev->net_conf->integrity_alg) + 1;
1962
1963 /* we must not recurse into our own queue,
1964 * as that is blocked during handshake */
1965 p = kmalloc(size, GFP_NOIO);
1966 if (p == NULL)
1967 return 0;
1968
1969 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1970 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1971 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1972 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001973 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1974
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001975 cf = 0;
1976 if (mdev->net_conf->want_lose)
1977 cf |= CF_WANT_LOSE;
1978 if (mdev->net_conf->dry_run) {
1979 if (mdev->agreed_pro_version >= 92)
1980 cf |= CF_DRY_RUN;
1981 else {
1982 dev_err(DEV, "--dry-run is not supported by peer");
Dan Carpenter7ac314c2010-04-22 14:27:23 +02001983 kfree(p);
Philipp Reisner148efa12011-01-15 00:21:15 +01001984 return -1;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01001985 }
1986 }
1987 p->conn_flags = cpu_to_be32(cf);
1988
Philipp Reisnerb411b362009-09-25 16:07:19 -07001989 if (mdev->agreed_pro_version >= 87)
1990 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1991
1992 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
Philipp Reisner0b70a132010-08-20 13:36:10 +02001993 (struct p_header80 *)p, size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001994 kfree(p);
1995 return rv;
1996}
1997
1998int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
1999{
2000 struct p_uuids p;
2001 int i;
2002
2003 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2004 return 1;
2005
2006 for (i = UI_CURRENT; i < UI_SIZE; i++)
2007 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2008
2009 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2010 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2011 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2012 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2013 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2014 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2015
2016 put_ldev(mdev);
2017
2018 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002019 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002020}
2021
2022int drbd_send_uuids(struct drbd_conf *mdev)
2023{
2024 return _drbd_send_uuids(mdev, 0);
2025}
2026
2027int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2028{
2029 return _drbd_send_uuids(mdev, 8);
2030}
2031
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002032void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2033{
2034 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2035 u64 *uuid = mdev->ldev->md.uuid;
2036 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2037 text,
2038 (unsigned long long)uuid[UI_CURRENT],
2039 (unsigned long long)uuid[UI_BITMAP],
2040 (unsigned long long)uuid[UI_HISTORY_START],
2041 (unsigned long long)uuid[UI_HISTORY_END]);
2042 put_ldev(mdev);
2043 } else {
2044 dev_info(DEV, "%s effective data uuid: %016llX\n",
2045 text,
2046 (unsigned long long)mdev->ed_uuid);
2047 }
2048}
2049
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002050int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002051{
2052 struct p_rs_uuid p;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002053 u64 uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002054
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002055 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2056
Philipp Reisner4a23f262011-01-11 17:42:17 +01002057 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002058 drbd_uuid_set(mdev, UI_BITMAP, uuid);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01002059 drbd_print_uuids(mdev, "updated sync UUID");
Lars Ellenberg5a22db82010-12-17 21:14:23 +01002060 drbd_md_sync(mdev);
2061 p.uuid = cpu_to_be64(uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002062
2063 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002064 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002065}
2066
Philipp Reisnere89b5912010-03-24 17:11:33 +01002067int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002068{
2069 struct p_sizes p;
2070 sector_t d_size, u_size;
2071 int q_order_type;
2072 int ok;
2073
2074 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2075 D_ASSERT(mdev->ldev->backing_bdev);
2076 d_size = drbd_get_max_capacity(mdev->ldev);
2077 u_size = mdev->ldev->dc.disk_size;
2078 q_order_type = drbd_queue_order_type(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002079 put_ldev(mdev);
2080 } else {
2081 d_size = 0;
2082 u_size = 0;
2083 q_order_type = QUEUE_ORDERED_NONE;
2084 }
2085
2086 p.d_size = cpu_to_be64(d_size);
2087 p.u_size = cpu_to_be64(u_size);
2088 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01002089 p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9);
Philipp Reisnere89b5912010-03-24 17:11:33 +01002090 p.queue_order_type = cpu_to_be16(q_order_type);
2091 p.dds_flags = cpu_to_be16(flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002092
2093 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002094 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002095 return ok;
2096}
2097
2098/**
2099 * drbd_send_state() - Sends the drbd state to the peer
2100 * @mdev: DRBD device.
2101 */
2102int drbd_send_state(struct drbd_conf *mdev)
2103{
2104 struct socket *sock;
2105 struct p_state p;
2106 int ok = 0;
2107
2108 /* Grab state lock so we won't send state if we're in the middle
2109 * of a cluster-wide state change on another thread */
2110 drbd_state_lock(mdev);
2111
2112 mutex_lock(&mdev->data.mutex);
2113
2114 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2115 sock = mdev->data.socket;
2116
2117 if (likely(sock != NULL)) {
2118 ok = _drbd_send_cmd(mdev, sock, P_STATE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002119 (struct p_header80 *)&p, sizeof(p), 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002120 }
2121
2122 mutex_unlock(&mdev->data.mutex);
2123
2124 drbd_state_unlock(mdev);
2125 return ok;
2126}
2127
2128int drbd_send_state_req(struct drbd_conf *mdev,
2129 union drbd_state mask, union drbd_state val)
2130{
2131 struct p_req_state p;
2132
2133 p.mask = cpu_to_be32(mask.i);
2134 p.val = cpu_to_be32(val.i);
2135
2136 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002137 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002138}
2139
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01002140int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002141{
2142 struct p_req_state_reply p;
2143
2144 p.retcode = cpu_to_be32(retcode);
2145
2146 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002147 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002148}
2149
2150int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2151 struct p_compressed_bm *p,
2152 struct bm_xfer_ctx *c)
2153{
2154 struct bitstream bs;
2155 unsigned long plain_bits;
2156 unsigned long tmp;
2157 unsigned long rl;
2158 unsigned len;
2159 unsigned toggle;
2160 int bits;
2161
2162 /* may we use this feature? */
2163 if ((mdev->sync_conf.use_rle == 0) ||
2164 (mdev->agreed_pro_version < 90))
2165 return 0;
2166
2167 if (c->bit_offset >= c->bm_bits)
2168 return 0; /* nothing to do. */
2169
2170 /* use at most this many bytes */
2171 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2172 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2173 /* plain bits covered in this code string */
2174 plain_bits = 0;
2175
2176 /* p->encoding & 0x80 stores whether the first run length is set.
2177 * bit offset is implicit.
2178 * start with toggle == 2 to be able to tell the first iteration */
2179 toggle = 2;
2180
2181 /* see how many plain bits we can stuff into one packet
2182 * using RLE and VLI. */
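	/* Worked example (illustrative only): for a bitmap chunk 0000111101
	 * the loop below emits the run lengths 4, 4, 1, 1 with the "start"
	 * flag cleared, since the first run consists of zero bits; the
	 * receiver reconstructs the chunk by toggling between clear and set
	 * runs while decoding the VLI-encoded lengths. */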
2183 do {
2184 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2185 : _drbd_bm_find_next(mdev, c->bit_offset);
2186 if (tmp == -1UL)
2187 tmp = c->bm_bits;
2188 rl = tmp - c->bit_offset;
2189
2190 if (toggle == 2) { /* first iteration */
2191 if (rl == 0) {
2192 /* the first checked bit was set,
2193 * store start value, */
2194 DCBP_set_start(p, 1);
2195 /* but skip encoding of zero run length */
2196 toggle = !toggle;
2197 continue;
2198 }
2199 DCBP_set_start(p, 0);
2200 }
2201
2202 /* paranoia: catch zero runlength.
2203 * can only happen if bitmap is modified while we scan it. */
2204 if (rl == 0) {
2205 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2206 "t:%u bo:%lu\n", toggle, c->bit_offset);
2207 return -1;
2208 }
2209
2210 bits = vli_encode_bits(&bs, rl);
2211 if (bits == -ENOBUFS) /* buffer full */
2212 break;
2213 if (bits <= 0) {
2214 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2215 return 0;
2216 }
2217
2218 toggle = !toggle;
2219 plain_bits += rl;
2220 c->bit_offset = tmp;
2221 } while (c->bit_offset < c->bm_bits);
2222
2223 len = bs.cur.b - p->code + !!bs.cur.bit;
2224
2225 if (plain_bits < (len << 3)) {
2226 /* incompressible with this method.
2227 * we need to rewind both word and bit position. */
2228 c->bit_offset -= plain_bits;
2229 bm_xfer_ctx_bit_to_word_offset(c);
2230 c->bit_offset = c->word_offset * BITS_PER_LONG;
2231 return 0;
2232 }
2233
2234 /* RLE + VLI was able to compress it just fine.
2235 * update c->word_offset. */
2236 bm_xfer_ctx_bit_to_word_offset(c);
2237
2238 /* store pad_bits */
2239 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2240
2241 return len;
2242}
2243
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002244/**
2245 * send_bitmap_rle_or_plain
2246 *
2247 * Return 0 when done, 1 when another iteration is needed, and a negative error
2248 * code upon failure.
2249 */
2250static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07002251send_bitmap_rle_or_plain(struct drbd_conf *mdev,
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002252 struct p_header80 *h, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002253{
2254 struct p_compressed_bm *p = (void*)h;
2255 unsigned long num_words;
2256 int len;
2257 int ok;
2258
2259 len = fill_bitmap_rle_bits(mdev, p, c);
2260
2261 if (len < 0)
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002262 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002263
2264 if (len) {
2265 DCBP_set_code(p, RLE_VLI_Bits);
2266 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2267 sizeof(*p) + len, 0);
2268
2269 c->packets[0]++;
2270 c->bytes[0] += sizeof(*p) + len;
2271
2272 if (c->bit_offset >= c->bm_bits)
2273 len = 0; /* DONE */
2274 } else {
2275 /* was not compressible.
2276 * send a buffer full of plain text bits instead. */
2277 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2278 len = num_words * sizeof(long);
2279 if (len)
2280 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2281 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002282 h, sizeof(struct p_header80) + len, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002283 c->word_offset += num_words;
2284 c->bit_offset = c->word_offset * BITS_PER_LONG;
2285
2286 c->packets[1]++;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002287 c->bytes[1] += sizeof(struct p_header80) + len;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002288
2289 if (c->bit_offset > c->bm_bits)
2290 c->bit_offset = c->bm_bits;
2291 }
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002292 if (ok) {
2293 if (len == 0) {
2294 INFO_bm_xfer_stats(mdev, "send", c);
2295 return 0;
2296 } else
2297 return 1;
2298 }
2299 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002300}
2301
2302/* See the comment at receive_bitmap() */
2303int _drbd_send_bitmap(struct drbd_conf *mdev)
2304{
2305 struct bm_xfer_ctx c;
Philipp Reisner0b70a132010-08-20 13:36:10 +02002306 struct p_header80 *p;
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002307 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002308
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002309 ERR_IF(!mdev->bitmap) return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002310
2311 /* maybe we should use some per thread scratch page,
2312 * and allocate that during initial device creation? */
Philipp Reisner0b70a132010-08-20 13:36:10 +02002313 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002314 if (!p) {
2315 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002316 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002317 }
2318
2319 if (get_ldev(mdev)) {
2320 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2321 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2322 drbd_bm_set_all(mdev);
2323 if (drbd_bm_write(mdev)) {
2324 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2325 * but otherwise process as per normal - need to tell other
2326 * side that a full resync is required! */
2327 dev_err(DEV, "Failed to write bitmap to disk!\n");
2328 } else {
2329 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2330 drbd_md_sync(mdev);
2331 }
2332 }
2333 put_ldev(mdev);
2334 }
2335
2336 c = (struct bm_xfer_ctx) {
2337 .bm_bits = drbd_bm_bits(mdev),
2338 .bm_words = drbd_bm_words(mdev),
2339 };
2340
2341 do {
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002342 err = send_bitmap_rle_or_plain(mdev, p, &c);
2343 } while (err > 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002344
2345 free_page((unsigned long) p);
Andreas Gruenbacherf70af112010-12-11 18:51:50 +01002346 return err == 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002347}
2348
2349int drbd_send_bitmap(struct drbd_conf *mdev)
2350{
2351 int err;
2352
2353 if (!drbd_get_data_sock(mdev))
2354 return -1;
2355 err = !_drbd_send_bitmap(mdev);
2356 drbd_put_data_sock(mdev);
2357 return err;
2358}
2359
2360int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2361{
2362 int ok;
2363 struct p_barrier_ack p;
2364
2365 p.barrier = barrier_nr;
2366 p.set_size = cpu_to_be32(set_size);
2367
2368 if (mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002369 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002370 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002371 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002372 return ok;
2373}
2374
2375/**
2376 * _drbd_send_ack() - Sends an ack packet
2377 * @mdev: DRBD device.
2378 * @cmd: Packet command code.
2379 * @sector: sector, needs to be in big endian byte order
2380 * @blksize: size in byte, needs to be in big endian byte order
2381 * @block_id: Id, big endian byte order
2382 */
2383static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2384 u64 sector,
2385 u32 blksize,
2386 u64 block_id)
2387{
2388 int ok;
2389 struct p_block_ack p;
2390
2391 p.sector = sector;
2392 p.block_id = block_id;
2393 p.blksize = blksize;
2394 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2395
2396 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002397 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002398 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002399 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002400 return ok;
2401}
2402
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002403/* dp->sector and dp->block_id already/still in network byte order,
2404 * data_size is payload size according to dp->head,
2405 * and may need to be corrected for digest size. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002406int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002407 struct p_data *dp, int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002408{
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02002409 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2410 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002411 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2412 dp->block_id);
2413}
2414
2415int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2416 struct p_block_req *rp)
2417{
2418 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2419}
2420
2421/**
2422 * drbd_send_ack() - Sends an ack packet
2423 * @mdev: DRBD device.
2424 * @cmd: Packet command code.
2425 * @e: Epoch entry.
2426 */
2427int drbd_send_ack(struct drbd_conf *mdev,
2428 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2429{
2430 return _drbd_send_ack(mdev, cmd,
2431 cpu_to_be64(e->sector),
2432 cpu_to_be32(e->size),
2433 e->block_id);
2434}
2435
2436/* This function misuses the block_id field to signal if the blocks
2437 * are in sync or not. */
2438int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2439 sector_t sector, int blksize, u64 block_id)
2440{
2441 return _drbd_send_ack(mdev, cmd,
2442 cpu_to_be64(sector),
2443 cpu_to_be32(blksize),
2444 cpu_to_be64(block_id));
2445}
2446
2447int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2448 sector_t sector, int size, u64 block_id)
2449{
2450 int ok;
2451 struct p_block_req p;
2452
2453 p.sector = cpu_to_be64(sector);
2454 p.block_id = block_id;
2455 p.blksize = cpu_to_be32(size);
2456
2457 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002458 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002459 return ok;
2460}
2461
2462int drbd_send_drequest_csum(struct drbd_conf *mdev,
2463 sector_t sector, int size,
2464 void *digest, int digest_size,
2465 enum drbd_packets cmd)
2466{
2467 int ok;
2468 struct p_block_req p;
2469
2470 p.sector = cpu_to_be64(sector);
2471 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2472 p.blksize = cpu_to_be32(size);
2473
2474 p.head.magic = BE_DRBD_MAGIC;
2475 p.head.command = cpu_to_be16(cmd);
Philipp Reisner0b70a132010-08-20 13:36:10 +02002476 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002477
2478 mutex_lock(&mdev->data.mutex);
2479
2480 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2481 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2482
2483 mutex_unlock(&mdev->data.mutex);
2484
2485 return ok;
2486}
2487
2488int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2489{
2490 int ok;
2491 struct p_block_req p;
2492
2493 p.sector = cpu_to_be64(sector);
2494 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2495 p.blksize = cpu_to_be32(size);
2496
2497 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
Philipp Reisner0b70a132010-08-20 13:36:10 +02002498 (struct p_header80 *)&p, sizeof(p));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002499 return ok;
2500}
2501
2502/* called on sndtimeo
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002503 * returns false if we should retry,
2504 * true if we think connection is dead
Philipp Reisnerb411b362009-09-25 16:07:19 -07002505 */
2506static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2507{
2508 int drop_it;
2509 /* long elapsed = (long)(jiffies - mdev->last_received); */
2510
2511 drop_it = mdev->meta.socket == sock
2512 || !mdev->asender.task
2513 || get_t_state(&mdev->asender) != Running
2514 || mdev->state.conn < C_CONNECTED;
2515
2516 if (drop_it)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002517 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002518
2519 drop_it = !--mdev->ko_count;
2520 if (!drop_it) {
2521 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2522 current->comm, current->pid, mdev->ko_count);
2523 request_ping(mdev);
2524 }
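	/* Illustrative: ko_count is reloaded from net_conf at the start of
	 * each drbd_send() on the data socket, so with a configured ko-count
	 * of e.g. 3 the first two consecutive send timeouts only warn and
	 * request a ping, while the third one makes drop_it true and the
	 * connection is treated as dead. */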
2525
2526 return drop_it; /* && (mdev->state == R_PRIMARY) */;
2527}
2528
2529/* The idea of sendpage seems to be to put some kind of reference
2530 * to the page into the skb, and to hand it over to the NIC. In
2531 * this process get_page() gets called.
2532 *
2533 * As soon as the page was really sent over the network put_page()
2534 * gets called by some part of the network layer. [ NIC driver? ]
2535 *
2536 * [ get_page() / put_page() increment/decrement the count. If count
2537 * reaches 0 the page will be freed. ]
2538 *
2539 * This works nicely with pages from FSs.
2540 * But this means that in protocol A we might signal IO completion too early!
2541 *
2542 * In order not to corrupt data during a resync we must make sure
2543 * that we do not reuse our own buffer pages (EEs) too early, therefore
2544 * we have the net_ee list.
2545 *
2546 * XFS seems to have problems, still, it submits pages with page_count == 0!
2547 * As a workaround, we disable sendpage on pages
2548 * with page_count == 0 or PageSlab.
2549 */
2550static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002551 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002552{
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002553 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002554 kunmap(page);
2555 if (sent == size)
2556 mdev->send_cnt += size>>9;
2557 return sent == size;
2558}
2559
2560static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002561 int offset, size_t size, unsigned msg_flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002562{
2563 mm_segment_t oldfs = get_fs();
2564 int sent, ok;
2565 int len = size;
2566
2567 /* e.g. XFS meta- & log-data is in slab pages, which have a
2568 * page_count of 0 and/or have PageSlab() set.
2569 * we cannot use send_page for those, as that does get_page();
2570 * put_page(); and would cause either a VM_BUG directly, or
2571 * __page_cache_release a page that would actually still be referenced
2572 * by someone, leading to some obscure delayed Oops somewhere else. */
2573 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002574 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002575
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002576 msg_flags |= MSG_NOSIGNAL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002577 drbd_update_congested(mdev);
2578 set_fs(KERNEL_DS);
2579 do {
2580 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2581 offset, len,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002582 msg_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002583 if (sent == -EAGAIN) {
2584 if (we_should_drop_the_connection(mdev,
2585 mdev->data.socket))
2586 break;
2587 else
2588 continue;
2589 }
2590 if (sent <= 0) {
2591 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2592 __func__, (int)size, len, sent);
2593 break;
2594 }
2595 len -= sent;
2596 offset += sent;
2597 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2598 set_fs(oldfs);
2599 clear_bit(NET_CONGESTED, &mdev->flags);
2600
2601 ok = (len == 0);
2602 if (likely(ok))
2603 mdev->send_cnt += size>>9;
2604 return ok;
2605}
2606
2607static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2608{
2609 struct bio_vec *bvec;
2610 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002611 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002612 __bio_for_each_segment(bvec, bio, i, 0) {
2613 if (!_drbd_no_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002614 bvec->bv_offset, bvec->bv_len,
2615 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002616 return 0;
2617 }
2618 return 1;
2619}
2620
2621static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2622{
2623 struct bio_vec *bvec;
2624 int i;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002625 /* hint all but last page with MSG_MORE */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002626 __bio_for_each_segment(bvec, bio, i, 0) {
2627 if (!_drbd_send_page(mdev, bvec->bv_page,
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002628 bvec->bv_offset, bvec->bv_len,
2629 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002630 return 0;
2631 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002632 return 1;
2633}
2634
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002635static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2636{
2637 struct page *page = e->pages;
2638 unsigned len = e->size;
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002639 /* hint all but last page with MSG_MORE */
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002640 page_chain_for_each(page) {
2641 unsigned l = min_t(unsigned, len, PAGE_SIZE);
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002642 if (!_drbd_send_page(mdev, page, 0, l,
2643 page_chain_next(page) ? MSG_MORE : 0))
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002644 return 0;
2645 len -= l;
2646 }
2647 return 1;
2648}
2649
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002650static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2651{
2652 if (mdev->agreed_pro_version >= 95)
2653 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002654 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2655 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2656 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2657 else
Jens Axboe721a9602011-03-09 11:56:30 +01002658 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002659}
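/* Example reading of the mapping above: a bio carrying REQ_FLUSH|REQ_FUA is
 * announced as DP_FLUSH|DP_FUA to peers speaking protocol version 95 or
 * newer; towards older peers only the REQ_SYNC hint survives, as DP_RW_SYNC. */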
2660
Philipp Reisnerb411b362009-09-25 16:07:19 -07002661/* Used to send write requests
2662 * R_PRIMARY -> Peer (P_DATA)
2663 */
2664int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2665{
2666 int ok = 1;
2667 struct p_data p;
2668 unsigned int dp_flags = 0;
2669 void *dgb;
2670 int dgs;
2671
2672 if (!drbd_get_data_sock(mdev))
2673 return 0;
2674
2675 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2676 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2677
Philipp Reisnerd5373382010-08-23 15:18:33 +02002678 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002679 p.head.h80.magic = BE_DRBD_MAGIC;
2680 p.head.h80.command = cpu_to_be16(P_DATA);
2681 p.head.h80.length =
2682 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2683 } else {
2684 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2685 p.head.h95.command = cpu_to_be16(P_DATA);
2686 p.head.h95.length =
2687 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2688 }
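	/* Note on the header choice above: the h80 header stores its length
	 * in 16 bits (see _drbd_send_cmd), so it presumably only fits while
	 * dgs + req->size plus the packet overhead stays below 64 KiB;
	 * larger requests fall back to the h95 header with its 32 bit
	 * length, gated by DRBD_MAX_SIZE_H80_PACKET. */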
Philipp Reisnerb411b362009-09-25 16:07:19 -07002689
2690 p.sector = cpu_to_be64(req->sector);
2691 p.block_id = (unsigned long)req;
2692 p.seq_num = cpu_to_be32(req->seq_num =
2693 atomic_add_return(1, &mdev->packet_seq));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002694
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02002695 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2696
Philipp Reisnerb411b362009-09-25 16:07:19 -07002697 if (mdev->state.conn >= C_SYNC_SOURCE &&
2698 mdev->state.conn <= C_PAUSED_SYNC_T)
2699 dp_flags |= DP_MAY_SET_IN_SYNC;
2700
2701 p.dp_flags = cpu_to_be32(dp_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002702 set_bit(UNPLUG_REMOTE, &mdev->flags);
2703 ok = (sizeof(p) ==
Lars Ellenbergba11ad92010-05-25 16:26:16 +02002704 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002705 if (ok && dgs) {
2706 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002707 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002708 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002709 }
2710 if (ok) {
Lars Ellenberg470be442010-11-10 10:36:52 +01002711 /* For protocol A, we have to memcpy the payload into
2712 * socket buffers, as we may complete right away
2713 * as soon as we handed it over to tcp, at which point the data
2714 * pages may become invalid.
2715 *
2716 * With data integrity enabled, we copy it as well, so we can be
2717 * sure that even if the bio pages may still be modified, it
2718 * won't change the data on the wire, thus if the digest checks
2719 * out ok after sending on this side, but does not fit on the
2720 * receiving side, we have surely detected corruption elsewhere.
2721 */
2722 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002723 ok = _drbd_send_bio(mdev, req->master_bio);
2724 else
2725 ok = _drbd_send_zc_bio(mdev, req->master_bio);
Lars Ellenberg470be442010-11-10 10:36:52 +01002726
2727 /* double check digest, sometimes buffers have been modified in flight. */
2728 if (dgs > 0 && dgs <= 64) {
2729 /* 64 bytes, 512 bits, is the largest digest size
2730 * currently supported in kernel crypto. */
2731 unsigned char digest[64];
2732 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2733 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2734 dev_warn(DEV,
2735 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2736 (unsigned long long)req->sector, req->size);
2737 }
2738 } /* else if (dgs > 64) {
2739 ... Be noisy about digest too large ...
2740 } */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002741 }
2742
2743 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc2010-05-04 12:33:58 +02002744
Philipp Reisnerb411b362009-09-25 16:07:19 -07002745 return ok;
2746}
2747
2748/* answer packet, used to send data back for read requests:
2749 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2750 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2751 */
2752int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2753 struct drbd_epoch_entry *e)
2754{
2755 int ok;
2756 struct p_data p;
2757 void *dgb;
2758 int dgs;
2759
2760 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2761 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2762
Philipp Reisnerd5373382010-08-23 15:18:33 +02002763 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
Philipp Reisner0b70a132010-08-20 13:36:10 +02002764 p.head.h80.magic = BE_DRBD_MAGIC;
2765 p.head.h80.command = cpu_to_be16(cmd);
2766 p.head.h80.length =
2767 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2768 } else {
2769 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2770 p.head.h95.command = cpu_to_be16(cmd);
2771 p.head.h95.length =
2772 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2773 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002774
2775 p.sector = cpu_to_be64(e->sector);
2776 p.block_id = e->block_id;
2777 /* p.seq_num = 0; No sequence numbers here.. */
2778
2779 /* Only called by our kernel thread.
2780 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2781 * in response to admin command or module unload.
2782 */
2783 if (!drbd_get_data_sock(mdev))
2784 return 0;
2785
Philipp Reisner0b70a132010-08-20 13:36:10 +02002786 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002787 if (ok && dgs) {
2788 dgb = mdev->int_dig_out;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002789 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
Andreas Gruenbachercab2f742010-12-09 16:08:46 +01002790 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002791 }
2792 if (ok)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002793 ok = _drbd_send_zc_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002794
2795 drbd_put_data_sock(mdev);
Philipp Reisnerbd26bfc2010-05-04 12:33:58 +02002796
Philipp Reisnerb411b362009-09-25 16:07:19 -07002797 return ok;
2798}
2799
Philipp Reisner73a01a12010-10-27 14:33:00 +02002800int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2801{
2802 struct p_block_desc p;
2803
2804 p.sector = cpu_to_be64(req->sector);
2805 p.blksize = cpu_to_be32(req->size);
2806
2807 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2808}
2809
Philipp Reisnerb411b362009-09-25 16:07:19 -07002810/*
2811 drbd_send distinguishes two cases:
2812
2813 Packets sent via the data socket "sock"
2814 and packets sent via the meta data socket "msock"
2815
2816 sock msock
2817 -----------------+-------------------------+------------------------------
2818 timeout conf.timeout / 2 conf.timeout / 2
2819 timeout action send a ping via msock Abort communication
2820 and close all sockets
2821*/
2822
2823/*
2824 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2825 */
2826int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2827 void *buf, size_t size, unsigned msg_flags)
2828{
2829 struct kvec iov;
2830 struct msghdr msg;
2831 int rv, sent = 0;
2832
2833 if (!sock)
2834 return -1000;
2835
2836 /* THINK if (signal_pending) return ... ? */
2837
2838 iov.iov_base = buf;
2839 iov.iov_len = size;
2840
2841 msg.msg_name = NULL;
2842 msg.msg_namelen = 0;
2843 msg.msg_control = NULL;
2844 msg.msg_controllen = 0;
2845 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2846
2847 if (sock == mdev->data.socket) {
2848 mdev->ko_count = mdev->net_conf->ko_count;
2849 drbd_update_congested(mdev);
2850 }
2851 do {
2852 /* STRANGE
2853 * tcp_sendmsg does _not_ use its size parameter at all ?
2854 *
2855 * -EAGAIN on timeout, -EINTR on signal.
2856 */
2857/* THINK
2858 * do we need to block DRBD_SIG if sock == &meta.socket ??
2859 * otherwise wake_asender() might interrupt some send_*Ack !
2860 */
2861 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2862 if (rv == -EAGAIN) {
2863 if (we_should_drop_the_connection(mdev, sock))
2864 break;
2865 else
2866 continue;
2867 }
2868 D_ASSERT(rv != 0);
2869 if (rv == -EINTR) {
2870 flush_signals(current);
2871 rv = 0;
2872 }
2873 if (rv < 0)
2874 break;
2875 sent += rv;
2876 iov.iov_base += rv;
2877 iov.iov_len -= rv;
2878 } while (sent < size);
2879
2880 if (sock == mdev->data.socket)
2881 clear_bit(NET_CONGESTED, &mdev->flags);
2882
2883 if (rv <= 0) {
2884 if (rv != -EAGAIN) {
2885 dev_err(DEV, "%s_sendmsg returned %d\n",
2886 sock == mdev->meta.socket ? "msock" : "sock",
2887 rv);
2888 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2889 } else
2890 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2891 }
2892
2893 return sent;
2894}
2895
2896static int drbd_open(struct block_device *bdev, fmode_t mode)
2897{
2898 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2899 unsigned long flags;
2900 int rv = 0;
2901
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002902 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002903 spin_lock_irqsave(&mdev->req_lock, flags);
2904 /* to have a stable mdev->state.role
2905 * and no race with updating open_cnt */
2906
2907 if (mdev->state.role != R_PRIMARY) {
2908 if (mode & FMODE_WRITE)
2909 rv = -EROFS;
2910 else if (!allow_oos)
2911 rv = -EMEDIUMTYPE;
2912 }
2913
2914 if (!rv)
2915 mdev->open_cnt++;
2916 spin_unlock_irqrestore(&mdev->req_lock, flags);
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002917 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002918
2919 return rv;
2920}
2921
2922static int drbd_release(struct gendisk *gd, fmode_t mode)
2923{
2924 struct drbd_conf *mdev = gd->private_data;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002925 mutex_lock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002926 mdev->open_cnt--;
Arnd Bergmann2a48fc02010-06-02 14:28:52 +02002927 mutex_unlock(&drbd_main_mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002928 return 0;
2929}
2930
Philipp Reisnerb411b362009-09-25 16:07:19 -07002931static void drbd_set_defaults(struct drbd_conf *mdev)
2932{
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002933 /* This way we get a compile error when sync_conf grows,
2934 and we forgot to initialize it here */
2935 mdev->sync_conf = (struct syncer_conf) {
2936 /* .rate = */ DRBD_RATE_DEF,
2937 /* .after = */ DRBD_AFTER_DEF,
2938 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002939 /* .verify_alg = */ {}, 0,
2940 /* .cpu_mask = */ {}, 0,
2941 /* .csums_alg = */ {}, 0,
Philipp Reisnere7564142010-06-29 17:35:34 +02002942 /* .use_rle = */ 0,
Philipp Reisner9a31d712010-07-05 13:42:03 +02002943 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2944 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2945 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2946 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002947 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2948 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
Philipp Reisner85f4cc12010-06-29 17:35:34 +02002949 };
2950
2951 /* Have to do it this way, because the layout differs between
2952 big endian and little endian */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002953 mdev->state = (union drbd_state) {
2954 { .role = R_SECONDARY,
2955 .peer = R_UNKNOWN,
2956 .conn = C_STANDALONE,
2957 .disk = D_DISKLESS,
2958 .pdsk = D_UNKNOWN,
Philipp Reisnerfb22c402010-09-08 23:20:21 +02002959 .susp = 0,
2960 .susp_nod = 0,
2961 .susp_fen = 0
Philipp Reisnerb411b362009-09-25 16:07:19 -07002962 } };
2963}
2964
2965void drbd_init_set_defaults(struct drbd_conf *mdev)
2966{
2967 /* the memset(,0,) did most of this.
2968 * note: only assignments, no allocation in here */
2969
2970 drbd_set_defaults(mdev);
2971
Philipp Reisnerb411b362009-09-25 16:07:19 -07002972 atomic_set(&mdev->ap_bio_cnt, 0);
2973 atomic_set(&mdev->ap_pending_cnt, 0);
2974 atomic_set(&mdev->rs_pending_cnt, 0);
2975 atomic_set(&mdev->unacked_cnt, 0);
2976 atomic_set(&mdev->local_cnt, 0);
2977 atomic_set(&mdev->net_cnt, 0);
2978 atomic_set(&mdev->packet_seq, 0);
2979 atomic_set(&mdev->pp_in_use, 0);
Lars Ellenberg435f0742010-09-06 12:30:25 +02002980 atomic_set(&mdev->pp_in_use_by_net, 0);
Philipp Reisner778f2712010-07-06 11:14:00 +02002981 atomic_set(&mdev->rs_sect_in, 0);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002982 atomic_set(&mdev->rs_sect_ev, 0);
Philipp Reisner759fbdf2010-10-26 16:02:27 +02002983 atomic_set(&mdev->ap_in_flight, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002984
2985 mutex_init(&mdev->md_io_mutex);
2986 mutex_init(&mdev->data.mutex);
2987 mutex_init(&mdev->meta.mutex);
2988 sema_init(&mdev->data.work.s, 0);
2989 sema_init(&mdev->meta.work.s, 0);
2990 mutex_init(&mdev->state_mutex);
2991
2992 spin_lock_init(&mdev->data.work.q_lock);
2993 spin_lock_init(&mdev->meta.work.q_lock);
2994
2995 spin_lock_init(&mdev->al_lock);
2996 spin_lock_init(&mdev->req_lock);
2997 spin_lock_init(&mdev->peer_seq_lock);
2998 spin_lock_init(&mdev->epoch_lock);
2999
3000 INIT_LIST_HEAD(&mdev->active_ee);
3001 INIT_LIST_HEAD(&mdev->sync_ee);
3002 INIT_LIST_HEAD(&mdev->done_ee);
3003 INIT_LIST_HEAD(&mdev->read_ee);
3004 INIT_LIST_HEAD(&mdev->net_ee);
3005 INIT_LIST_HEAD(&mdev->resync_reads);
3006 INIT_LIST_HEAD(&mdev->data.work.q);
3007 INIT_LIST_HEAD(&mdev->meta.work.q);
3008 INIT_LIST_HEAD(&mdev->resync_work.list);
3009 INIT_LIST_HEAD(&mdev->unplug_work.list);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003010 INIT_LIST_HEAD(&mdev->go_diskless.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003011 INIT_LIST_HEAD(&mdev->md_sync_work.list);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003012 INIT_LIST_HEAD(&mdev->start_resync_work.list);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003013 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
Philipp Reisner0ced55a2010-04-30 15:26:20 +02003014
Philipp Reisner794abb72010-12-27 11:51:23 +01003015 mdev->resync_work.cb = w_resync_timer;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003016 mdev->unplug_work.cb = w_send_write_hint;
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003017 mdev->go_diskless.cb = w_go_diskless;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003018 mdev->md_sync_work.cb = w_md_sync;
3019 mdev->bm_io_work.w.cb = w_bitmap_io;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003020 mdev->start_resync_work.cb = w_start_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003021 init_timer(&mdev->resync_timer);
3022 init_timer(&mdev->md_sync_timer);
Philipp Reisner370a43e2011-01-14 16:03:11 +01003023 init_timer(&mdev->start_resync_timer);
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003024 init_timer(&mdev->request_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003025 mdev->resync_timer.function = resync_timer_fn;
3026 mdev->resync_timer.data = (unsigned long) mdev;
3027 mdev->md_sync_timer.function = md_sync_timer_fn;
3028 mdev->md_sync_timer.data = (unsigned long) mdev;
Philipp Reisner370a43e2011-01-14 16:03:11 +01003029 mdev->start_resync_timer.function = start_resync_timer_fn;
3030 mdev->start_resync_timer.data = (unsigned long) mdev;
Philipp Reisner7fde2be2011-03-01 11:08:28 +01003031 mdev->request_timer.function = request_timer_fn;
3032 mdev->request_timer.data = (unsigned long) mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003033
3034 init_waitqueue_head(&mdev->misc_wait);
3035 init_waitqueue_head(&mdev->state_wait);
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003036 init_waitqueue_head(&mdev->net_cnt_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003037 init_waitqueue_head(&mdev->ee_wait);
3038 init_waitqueue_head(&mdev->al_wait);
3039 init_waitqueue_head(&mdev->seq_wait);
3040
3041 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3042 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3043 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3044
3045 mdev->agreed_pro_version = PRO_VERSION_MAX;
Philipp Reisner2451fc32010-08-24 13:43:11 +02003046 mdev->write_ordering = WO_bdev_flush;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003047 mdev->resync_wenr = LC_FREE;
3048}
3049
3050void drbd_mdev_cleanup(struct drbd_conf *mdev)
3051{
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003052 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003053 if (mdev->receiver.t_state != None)
3054 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3055 mdev->receiver.t_state);
3056
3057 /* no need to lock it, I'm the only thread alive */
3058 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3059 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3060 mdev->al_writ_cnt =
3061 mdev->bm_writ_cnt =
3062 mdev->read_cnt =
3063 mdev->recv_cnt =
3064 mdev->send_cnt =
3065 mdev->writ_cnt =
3066 mdev->p_size =
3067 mdev->rs_start =
3068 mdev->rs_total =
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003069 mdev->rs_failed = 0;
3070 mdev->rs_last_events = 0;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02003071 mdev->rs_last_sect_ev = 0;
Lars Ellenberg1d7734a2010-08-11 21:21:50 +02003072 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3073 mdev->rs_mark_left[i] = 0;
3074 mdev->rs_mark_time[i] = 0;
3075 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003076 D_ASSERT(mdev->net_conf == NULL);
3077
3078 drbd_set_my_capacity(mdev, 0);
3079 if (mdev->bitmap) {
3080 /* maybe never allocated. */
Philipp Reisner02d9a942010-03-24 16:23:03 +01003081 drbd_bm_resize(mdev, 0, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003082 drbd_bm_cleanup(mdev);
3083 }
3084
3085 drbd_free_resources(mdev);
Philipp Reisner07782862010-08-31 12:00:50 +02003086 clear_bit(AL_SUSPENDED, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003087
3088 /*
3089	 * currently we call drbd_init_ee only on module load, so
3090	 * we may call drbd_release_ee only on module unload!
3091 */
3092 D_ASSERT(list_empty(&mdev->active_ee));
3093 D_ASSERT(list_empty(&mdev->sync_ee));
3094 D_ASSERT(list_empty(&mdev->done_ee));
3095 D_ASSERT(list_empty(&mdev->read_ee));
3096 D_ASSERT(list_empty(&mdev->net_ee));
3097 D_ASSERT(list_empty(&mdev->resync_reads));
3098 D_ASSERT(list_empty(&mdev->data.work.q));
3099 D_ASSERT(list_empty(&mdev->meta.work.q));
3100 D_ASSERT(list_empty(&mdev->resync_work.list));
3101 D_ASSERT(list_empty(&mdev->unplug_work.list));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003102 D_ASSERT(list_empty(&mdev->go_diskless.list));
Lars Ellenberg2265b472010-12-16 15:41:26 +01003103
3104 drbd_set_defaults(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003105}
3106
3107
3108static void drbd_destroy_mempools(void)
3109{
3110 struct page *page;
3111
3112 while (drbd_pp_pool) {
3113 page = drbd_pp_pool;
3114 drbd_pp_pool = (struct page *)page_private(page);
3115 __free_page(page);
3116 drbd_pp_vacant--;
3117 }
3118
3119 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3120
3121 if (drbd_ee_mempool)
3122 mempool_destroy(drbd_ee_mempool);
3123 if (drbd_request_mempool)
3124 mempool_destroy(drbd_request_mempool);
3125 if (drbd_ee_cache)
3126 kmem_cache_destroy(drbd_ee_cache);
3127 if (drbd_request_cache)
3128 kmem_cache_destroy(drbd_request_cache);
3129 if (drbd_bm_ext_cache)
3130 kmem_cache_destroy(drbd_bm_ext_cache);
3131 if (drbd_al_ext_cache)
3132 kmem_cache_destroy(drbd_al_ext_cache);
3133
3134 drbd_ee_mempool = NULL;
3135 drbd_request_mempool = NULL;
3136 drbd_ee_cache = NULL;
3137 drbd_request_cache = NULL;
3138 drbd_bm_ext_cache = NULL;
3139 drbd_al_ext_cache = NULL;
3140
3141 return;
3142}
3143
3144static int drbd_create_mempools(void)
3145{
3146 struct page *page;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003147 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003148 int i;
3149
3150 /* prepare our caches and mempools */
3151 drbd_request_mempool = NULL;
3152 drbd_ee_cache = NULL;
3153 drbd_request_cache = NULL;
3154 drbd_bm_ext_cache = NULL;
3155 drbd_al_ext_cache = NULL;
3156 drbd_pp_pool = NULL;
3157
3158 /* caches */
3159 drbd_request_cache = kmem_cache_create(
3160 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3161 if (drbd_request_cache == NULL)
3162 goto Enomem;
3163
3164 drbd_ee_cache = kmem_cache_create(
3165 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3166 if (drbd_ee_cache == NULL)
3167 goto Enomem;
3168
3169 drbd_bm_ext_cache = kmem_cache_create(
3170 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3171 if (drbd_bm_ext_cache == NULL)
3172 goto Enomem;
3173
3174 drbd_al_ext_cache = kmem_cache_create(
3175 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3176 if (drbd_al_ext_cache == NULL)
3177 goto Enomem;
3178
3179 /* mempools */
3180 drbd_request_mempool = mempool_create(number,
3181 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3182 if (drbd_request_mempool == NULL)
3183 goto Enomem;
3184
3185 drbd_ee_mempool = mempool_create(number,
3186 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
Nicolas Kaiser2027ae12010-10-28 06:15:26 -06003187 if (drbd_ee_mempool == NULL)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003188 goto Enomem;
3189
3190 /* drbd's page pool */
3191 spin_lock_init(&drbd_pp_lock);
3192
3193 for (i = 0; i < number; i++) {
3194 page = alloc_page(GFP_HIGHUSER);
3195 if (!page)
3196 goto Enomem;
3197 set_page_private(page, (unsigned long)drbd_pp_pool);
3198 drbd_pp_pool = page;
3199 }
3200 drbd_pp_vacant = number;
3201
3202 return 0;
3203
3204Enomem:
3205 drbd_destroy_mempools(); /* in case we allocated some */
3206 return -ENOMEM;
3207}
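/*
 * Sizing note (illustrative): "number" above is one maximally sized bio
 * worth of pages (DRBD_MAX_BIO_SIZE/PAGE_SIZE) per configured minor.
 * mempool_create(number, ...) keeps that many objects as an emergency
 * reserve so request and epoch-entry allocations can make forward
 * progress under memory pressure, and the loop below preallocates the
 * same number of pages for drbd's private page pool.
 */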
3208
3209static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3210 void *unused)
3211{
3212 /* just so we have it. you never know what interesting things we
3213 * might want to do here some day...
3214 */
3215
3216 return NOTIFY_DONE;
3217}
3218
3219static struct notifier_block drbd_notifier = {
3220 .notifier_call = drbd_notify_sys,
3221};
3222
3223static void drbd_release_ee_lists(struct drbd_conf *mdev)
3224{
3225 int rr;
3226
3227 rr = drbd_release_ee(mdev, &mdev->active_ee);
3228 if (rr)
3229 dev_err(DEV, "%d EEs in active list found!\n", rr);
3230
3231 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3232 if (rr)
3233 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3234
3235 rr = drbd_release_ee(mdev, &mdev->read_ee);
3236 if (rr)
3237 dev_err(DEV, "%d EEs in read list found!\n", rr);
3238
3239 rr = drbd_release_ee(mdev, &mdev->done_ee);
3240 if (rr)
3241 dev_err(DEV, "%d EEs in done list found!\n", rr);
3242
3243 rr = drbd_release_ee(mdev, &mdev->net_ee);
3244 if (rr)
3245 dev_err(DEV, "%d EEs in net list found!\n", rr);
3246}
3247
3248/* caution. no locking.
3249 * currently only used from module cleanup code. */
3250static void drbd_delete_device(unsigned int minor)
3251{
3252 struct drbd_conf *mdev = minor_to_mdev(minor);
3253
3254 if (!mdev)
3255 return;
3256
3257 /* paranoia asserts */
3258 if (mdev->open_cnt != 0)
3259 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3260 __FILE__ , __LINE__);
3261
3262 ERR_IF (!list_empty(&mdev->data.work.q)) {
3263 struct list_head *lp;
3264 list_for_each(lp, &mdev->data.work.q) {
3265 dev_err(DEV, "lp = %p\n", lp);
3266 }
3267 };
3268 /* end paranoia asserts */
3269
3270 del_gendisk(mdev->vdisk);
3271
3272 /* cleanup stuff that may have been allocated during
3273 * device (re-)configuration or state changes */
3274
3275 if (mdev->this_bdev)
3276 bdput(mdev->this_bdev);
3277
3278 drbd_free_resources(mdev);
3279
3280 drbd_release_ee_lists(mdev);
3281
3282 /* should be free'd on disconnect? */
3283 kfree(mdev->ee_hash);
3284 /*
3285 mdev->ee_hash_s = 0;
3286 mdev->ee_hash = NULL;
3287 */
3288
3289 lc_destroy(mdev->act_log);
3290 lc_destroy(mdev->resync);
3291
3292 kfree(mdev->p_uuid);
3293 /* mdev->p_uuid = NULL; */
3294
3295 kfree(mdev->int_dig_out);
3296 kfree(mdev->int_dig_in);
3297 kfree(mdev->int_dig_vv);
3298
3299 /* cleanup the rest that has been
3300 * allocated from drbd_new_device
3301 * and actually free the mdev itself */
3302 drbd_free_mdev(mdev);
3303}
3304
3305static void drbd_cleanup(void)
3306{
3307 unsigned int i;
3308
3309 unregister_reboot_notifier(&drbd_notifier);
3310
Lars Ellenberg17a93f32010-11-24 10:37:35 +01003311 /* first remove proc,
3312	 * drbdsetup uses its presence to detect
3313	 * whether DRBD is loaded.
3314	 * If we got stuck in proc removal,
3315	 * but had netlink already deregistered,
3316 * some drbdsetup commands may wait forever
3317 * for an answer.
3318 */
3319 if (drbd_proc)
3320 remove_proc_entry("drbd", NULL);
3321
Philipp Reisnerb411b362009-09-25 16:07:19 -07003322 drbd_nl_cleanup();
3323
3324 if (minor_table) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003325 i = minor_count;
3326 while (i--)
3327 drbd_delete_device(i);
3328 drbd_destroy_mempools();
3329 }
3330
3331 kfree(minor_table);
3332
3333 unregister_blkdev(DRBD_MAJOR, "drbd");
3334
3335 printk(KERN_INFO "drbd: module cleanup done.\n");
3336}
3337
3338/**
3339 * drbd_congested() - Callback for pdflush
3340 * @congested_data: User data
3341 * @bdi_bits: Bits pdflush is currently interested in
3342 *
3343 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3344 */
3345static int drbd_congested(void *congested_data, int bdi_bits)
3346{
3347 struct drbd_conf *mdev = congested_data;
3348 struct request_queue *q;
3349 char reason = '-';
3350 int r = 0;
3351
Andreas Gruenbacher1b881ef2010-12-13 18:03:38 +01003352 if (!may_inc_ap_bio(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003353 /* DRBD has frozen IO */
3354 r = bdi_bits;
3355 reason = 'd';
3356 goto out;
3357 }
3358
3359 if (get_ldev(mdev)) {
3360 q = bdev_get_queue(mdev->ldev->backing_bdev);
3361 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3362 put_ldev(mdev);
3363 if (r)
3364 reason = 'b';
3365 }
3366
3367 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3368 r |= (1 << BDI_async_congested);
3369 reason = reason == 'b' ? 'a' : 'n';
3370 }
3371
3372out:
3373 mdev->congestion_reason = reason;
3374 return r;
3375}
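/*
 * The single character recorded in mdev->congestion_reason above is a
 * diagnostic hint: 'd' means DRBD itself has frozen application IO,
 * 'b' means the local backing device reported congestion, 'n' means the
 * network path is congested (NET_CONGESTED), 'a' means both backing
 * device and network are congested, and '-' means not congested.
 */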
3376
3377struct drbd_conf *drbd_new_device(unsigned int minor)
3378{
3379 struct drbd_conf *mdev;
3380 struct gendisk *disk;
3381 struct request_queue *q;
3382
3383 /* GFP_KERNEL, we are outside of all write-out paths */
3384 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3385 if (!mdev)
3386 return NULL;
3387 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3388 goto out_no_cpumask;
3389
3390 mdev->minor = minor;
3391
3392 drbd_init_set_defaults(mdev);
3393
3394 q = blk_alloc_queue(GFP_KERNEL);
3395 if (!q)
3396 goto out_no_q;
3397 mdev->rq_queue = q;
3398 q->queuedata = mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003399
3400 disk = alloc_disk(1);
3401 if (!disk)
3402 goto out_no_disk;
3403 mdev->vdisk = disk;
3404
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003405 set_disk_ro(disk, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003406
3407 disk->queue = q;
3408 disk->major = DRBD_MAJOR;
3409 disk->first_minor = minor;
3410 disk->fops = &drbd_ops;
3411 sprintf(disk->disk_name, "drbd%d", minor);
3412 disk->private_data = mdev;
3413
3414 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3415 /* we have no partitions. we contain only ourselves. */
3416 mdev->this_bdev->bd_contains = mdev->this_bdev;
3417
3418 q->backing_dev_info.congested_fn = drbd_congested;
3419 q->backing_dev_info.congested_data = mdev;
3420
Andreas Gruenbacher2f58dcf2010-12-13 17:48:19 +01003421 blk_queue_make_request(q, drbd_make_request);
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003422 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003423 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3424 blk_queue_merge_bvec(q, drbd_merge_bvec);
Jens Axboe7eaceac2011-03-10 08:52:07 +01003425 q->queue_lock = &mdev->req_lock;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003426
3427 mdev->md_io_page = alloc_page(GFP_KERNEL);
3428 if (!mdev->md_io_page)
3429 goto out_no_io_page;
3430
3431 if (drbd_bm_init(mdev))
3432 goto out_no_bitmap;
3433 /* no need to lock access, we are still initializing this minor device. */
3434 if (!tl_init(mdev))
3435 goto out_no_tl;
3436
3437 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3438 if (!mdev->app_reads_hash)
3439 goto out_no_app_reads;
3440
3441 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3442 if (!mdev->current_epoch)
3443 goto out_no_epoch;
3444
3445 INIT_LIST_HEAD(&mdev->current_epoch->list);
3446 mdev->epochs = 1;
3447
3448 return mdev;
3449
3450/* out_whatever_else:
3451 kfree(mdev->current_epoch); */
3452out_no_epoch:
3453 kfree(mdev->app_reads_hash);
3454out_no_app_reads:
3455 tl_cleanup(mdev);
3456out_no_tl:
3457 drbd_bm_cleanup(mdev);
3458out_no_bitmap:
3459 __free_page(mdev->md_io_page);
3460out_no_io_page:
3461 put_disk(disk);
3462out_no_disk:
3463 blk_cleanup_queue(q);
3464out_no_q:
3465 free_cpumask_var(mdev->cpu_mask);
3466out_no_cpumask:
3467 kfree(mdev);
3468 return NULL;
3469}
3470
3471/* counterpart of drbd_new_device.
3472 * last part of drbd_delete_device. */
3473void drbd_free_mdev(struct drbd_conf *mdev)
3474{
3475 kfree(mdev->current_epoch);
3476 kfree(mdev->app_reads_hash);
3477 tl_cleanup(mdev);
3478 if (mdev->bitmap) /* should no longer be there. */
3479 drbd_bm_cleanup(mdev);
3480 __free_page(mdev->md_io_page);
3481 put_disk(mdev->vdisk);
3482 blk_cleanup_queue(mdev->rq_queue);
3483 free_cpumask_var(mdev->cpu_mask);
Philipp Reisner37190942010-11-10 12:08:37 +01003484 drbd_free_tl_hash(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003485 kfree(mdev);
3486}
3487
3488
3489int __init drbd_init(void)
3490{
3491 int err;
3492
3493 if (sizeof(struct p_handshake) != 80) {
3494 printk(KERN_ERR
3495 "drbd: never change the size or layout "
3496 "of the HandShake packet.\n");
3497 return -EINVAL;
3498 }
3499
Philipp Reisner2b8a90b2011-01-10 11:15:17 +01003500 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003501 printk(KERN_ERR
3502 "drbd: invalid minor_count (%d)\n", minor_count);
3503#ifdef MODULE
3504 return -EINVAL;
3505#else
3506 minor_count = 8;
3507#endif
3508 }
3509
3510 err = drbd_nl_init();
3511 if (err)
3512 return err;
3513
3514 err = register_blkdev(DRBD_MAJOR, "drbd");
3515 if (err) {
3516 printk(KERN_ERR
3517 "drbd: unable to register block device major %d\n",
3518 DRBD_MAJOR);
3519 return err;
3520 }
3521
3522 register_reboot_notifier(&drbd_notifier);
3523
3524 /*
3525 * allocate all necessary structs
3526 */
3527 err = -ENOMEM;
3528
3529 init_waitqueue_head(&drbd_pp_wait);
3530
3531 drbd_proc = NULL; /* play safe for drbd_cleanup */
3532 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3533 GFP_KERNEL);
3534 if (!minor_table)
3535 goto Enomem;
3536
3537 err = drbd_create_mempools();
3538 if (err)
3539 goto Enomem;
3540
Lars Ellenberg8c484ee2010-03-11 16:47:58 +01003541 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003542 if (!drbd_proc) {
3543 printk(KERN_ERR "drbd: unable to register proc file\n");
3544 goto Enomem;
3545 }
3546
3547 rwlock_init(&global_state_lock);
3548
3549 printk(KERN_INFO "drbd: initialized. "
3550 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3551 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3552 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3553 printk(KERN_INFO "drbd: registered as block device major %d\n",
3554 DRBD_MAJOR);
3555 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3556
3557 return 0; /* Success! */
3558
3559Enomem:
3560 drbd_cleanup();
3561 if (err == -ENOMEM)
3562 /* currently always the case */
3563 printk(KERN_ERR "drbd: ran out of memory\n");
3564 else
3565 printk(KERN_ERR "drbd: initialization failure\n");
3566 return err;
3567}
3568
3569void drbd_free_bc(struct drbd_backing_dev *ldev)
3570{
3571 if (ldev == NULL)
3572 return;
3573
Tejun Heoe525fd82010-11-13 11:55:17 +01003574 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3575 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003576
3577 kfree(ldev);
3578}
3579
3580void drbd_free_sock(struct drbd_conf *mdev)
3581{
3582 if (mdev->data.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003583 mutex_lock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003584 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3585 sock_release(mdev->data.socket);
3586 mdev->data.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003587 mutex_unlock(&mdev->data.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003588 }
3589 if (mdev->meta.socket) {
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003590 mutex_lock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003591 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3592 sock_release(mdev->meta.socket);
3593 mdev->meta.socket = NULL;
Lars Ellenberg4589d7f2010-03-03 02:25:33 +01003594 mutex_unlock(&mdev->meta.mutex);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003595 }
3596}
3597
3598
3599void drbd_free_resources(struct drbd_conf *mdev)
3600{
3601 crypto_free_hash(mdev->csums_tfm);
3602 mdev->csums_tfm = NULL;
3603 crypto_free_hash(mdev->verify_tfm);
3604 mdev->verify_tfm = NULL;
3605 crypto_free_hash(mdev->cram_hmac_tfm);
3606 mdev->cram_hmac_tfm = NULL;
3607 crypto_free_hash(mdev->integrity_w_tfm);
3608 mdev->integrity_w_tfm = NULL;
3609 crypto_free_hash(mdev->integrity_r_tfm);
3610 mdev->integrity_r_tfm = NULL;
3611
3612 drbd_free_sock(mdev);
3613
3614 __no_warn(local,
3615 drbd_free_bc(mdev->ldev);
3616 mdev->ldev = NULL;);
3617}
3618
3619/* meta data management */
3620
3621struct meta_data_on_disk {
3622 u64 la_size; /* last agreed size. */
3623 u64 uuid[UI_SIZE]; /* UUIDs. */
3624 u64 device_uuid;
3625 u64 reserved_u64_1;
3626 u32 flags; /* MDF */
3627 u32 magic;
3628 u32 md_size_sect;
3629 u32 al_offset; /* offset to this block */
3630 u32 al_nr_extents; /* important for restoring the AL */
3631 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3632 u32 bm_offset; /* offset to the bitmap, from here */
3633 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3634 u32 reserved_u32[4];
3635
3636} __packed;
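/*
 * Layout note: every multi-byte field of this on-disk superblock is
 * stored big-endian; drbd_md_sync() converts with cpu_to_be32()/
 * cpu_to_be64() on write and drbd_md_read() converts back with
 * be32_to_cpu()/be64_to_cpu().  Both functions access the buffer through
 * md_io_page under md_io_mutex, zeroing a full 512-byte block before
 * writing it at ldev->md.md_offset.
 */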
3637
3638/**
3639 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3640 * @mdev: DRBD device.
3641 */
3642void drbd_md_sync(struct drbd_conf *mdev)
3643{
3644 struct meta_data_on_disk *buffer;
3645 sector_t sector;
3646 int i;
3647
Lars Ellenbergee15b032010-09-03 10:00:09 +02003648 del_timer(&mdev->md_sync_timer);
3649 /* timer may be rearmed by drbd_md_mark_dirty() now. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003650 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3651 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003652
3653	/* We use D_FAILED here, and not D_ATTACHING, because we try to write
3654 * metadata even if we detach due to a disk failure! */
3655 if (!get_ldev_if_state(mdev, D_FAILED))
3656 return;
3657
Philipp Reisnerb411b362009-09-25 16:07:19 -07003658 mutex_lock(&mdev->md_io_mutex);
3659 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3660 memset(buffer, 0, 512);
3661
3662 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3663 for (i = UI_CURRENT; i < UI_SIZE; i++)
3664 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3665 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3666 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3667
3668 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3669 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3670 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3671 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3672 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3673
3674 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3675
3676 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3677 sector = mdev->ldev->md.md_offset;
3678
Lars Ellenberg3f3a9b82010-09-01 15:12:12 +02003679 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003680 /* this was a try anyways ... */
3681 dev_err(DEV, "meta data update failed!\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003682 drbd_chk_io_error(mdev, 1, true);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003683 }
3684
3685 /* Update mdev->ldev->md.la_size_sect,
3686 * since we updated it on metadata. */
3687 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3688
3689 mutex_unlock(&mdev->md_io_mutex);
3690 put_ldev(mdev);
3691}
3692
3693/**
3694 * drbd_md_read() - Reads in the meta data super block
3695 * @mdev: DRBD device.
3696 * @bdev: Device from which the meta data should be read in.
3697 *
Andreas Gruenbacher116676c2010-12-08 13:33:11 +01003698 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
Philipp Reisnerb411b362009-09-25 16:07:19 -07003699 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3700 */
3701int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3702{
3703 struct meta_data_on_disk *buffer;
3704 int i, rv = NO_ERROR;
3705
3706 if (!get_ldev_if_state(mdev, D_ATTACHING))
3707 return ERR_IO_MD_DISK;
3708
Philipp Reisnerb411b362009-09-25 16:07:19 -07003709 mutex_lock(&mdev->md_io_mutex);
3710 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3711
3712 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
Lucas De Marchi25985ed2011-03-30 22:57:33 -03003713 /* NOTE: can't do normal error processing here as this is
Philipp Reisnerb411b362009-09-25 16:07:19 -07003714 called BEFORE disk is attached */
3715 dev_err(DEV, "Error while reading metadata.\n");
3716 rv = ERR_IO_MD_DISK;
3717 goto err;
3718 }
3719
3720 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3721 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3722 rv = ERR_MD_INVALID;
3723 goto err;
3724 }
3725 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3726 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3727 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3728 rv = ERR_MD_INVALID;
3729 goto err;
3730 }
3731 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3732 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3733 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3734 rv = ERR_MD_INVALID;
3735 goto err;
3736 }
3737 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3738 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3739 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3740 rv = ERR_MD_INVALID;
3741 goto err;
3742 }
3743
3744 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3745 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3746 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3747 rv = ERR_MD_INVALID;
3748 goto err;
3749 }
3750
3751 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3752 for (i = UI_CURRENT; i < UI_SIZE; i++)
3753 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3754 bdev->md.flags = be32_to_cpu(buffer->flags);
3755 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3756 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3757
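	/* an on-disk al_extents value below 7 is not usable; fall back to
	 * 127 rather than refusing to attach */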
3758 if (mdev->sync_conf.al_extents < 7)
3759 mdev->sync_conf.al_extents = 127;
3760
3761 err:
3762 mutex_unlock(&mdev->md_io_mutex);
3763 put_ldev(mdev);
3764
3765 return rv;
3766}
3767
3768/**
3769 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3770 * @mdev: DRBD device.
3771 *
3772 * Call this function if you change anything that should be written to
3773	 * the meta-data super block. This function sets MD_DIRTY, and starts a
3774	 * timer that ensures drbd_md_sync() gets called within five seconds.
3775 */
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003776#ifdef DEBUG
Lars Ellenbergee15b032010-09-03 10:00:09 +02003777void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3778{
3779 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3780 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3781 mdev->last_md_mark_dirty.line = line;
3782 mdev->last_md_mark_dirty.func = func;
3783 }
3784}
3785#else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003786void drbd_md_mark_dirty(struct drbd_conf *mdev)
3787{
Lars Ellenbergee15b032010-09-03 10:00:09 +02003788 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
Lars Ellenbergca0e6092010-10-14 15:01:21 +02003789 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003790}
Lars Ellenbergee15b032010-09-03 10:00:09 +02003791#endif
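/*
 * Usage sketch (mirroring _drbd_uuid_set() below): code that changes
 * state destined for the on-disk superblock marks it dirty and either
 * syncs explicitly or lets md_sync_timer queue w_md_sync():
 *
 *	mdev->ldev->md.uuid[idx] = val;
 *	drbd_md_mark_dirty(mdev);
 *	...
 *	drbd_md_sync(mdev);	/+ optional; otherwise written within 5s +/
 */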
Philipp Reisnerb411b362009-09-25 16:07:19 -07003792
3793static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3794{
3795 int i;
3796
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003797 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003798 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003799}
3800
3801void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3802{
3803 if (idx == UI_CURRENT) {
3804 if (mdev->state.role == R_PRIMARY)
3805 val |= 1;
3806 else
3807 val &= ~((u64)1);
3808
3809 drbd_set_ed_uuid(mdev, val);
3810 }
3811
3812 mdev->ldev->md.uuid[idx] = val;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003813 drbd_md_mark_dirty(mdev);
3814}
3815
3816
3817void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3818{
3819 if (mdev->ldev->md.uuid[idx]) {
3820 drbd_uuid_move_history(mdev);
3821 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003822 }
3823 _drbd_uuid_set(mdev, idx, val);
3824}
3825
3826/**
3827 * drbd_uuid_new_current() - Creates a new current UUID
3828 * @mdev: DRBD device.
3829 *
3830 * Creates a new current UUID, and rotates the old current UUID into
3831 * the bitmap slot. Causes an incremental resync upon next connect.
3832 */
3833void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3834{
3835 u64 val;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003836 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003837
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003838 if (bm_uuid)
3839 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3840
Philipp Reisnerb411b362009-09-25 16:07:19 -07003841 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
Philipp Reisnerb411b362009-09-25 16:07:19 -07003842
3843 get_random_bytes(&val, sizeof(u64));
3844 _drbd_uuid_set(mdev, UI_CURRENT, val);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003845 drbd_print_uuids(mdev, "new current UUID");
Lars Ellenbergaaa8e2b2010-10-15 13:16:53 +02003846 /* get it to stable storage _now_ */
3847 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003848}
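/*
 * Rotation performed above, as a sketch (values are examples only):
 *
 *	before:  UI_CURRENT = A, UI_BITMAP = 0
 *	after:   UI_CURRENT = <new random value>, UI_BITMAP = A
 *
 * Roughly speaking, on the next connect the peer still presents A as its
 * current UUID, which now matches our bitmap UUID, so only the blocks
 * marked in the bitmap since the rotation need to be resynced.
 */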
3849
3850void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3851{
3852 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3853 return;
3854
3855 if (val == 0) {
3856 drbd_uuid_move_history(mdev);
3857 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3858 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003859 } else {
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003860 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3861 if (bm_uuid)
3862 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003863
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003864 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003865 }
3866 drbd_md_mark_dirty(mdev);
3867}
3868
3869/**
3870 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3871 * @mdev: DRBD device.
3872 *
3873 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3874 */
3875int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3876{
3877 int rv = -EIO;
3878
3879 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3880 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3881 drbd_md_sync(mdev);
3882 drbd_bm_set_all(mdev);
3883
3884 rv = drbd_bm_write(mdev);
3885
3886 if (!rv) {
3887 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3888 drbd_md_sync(mdev);
3889 }
3890
3891 put_ldev(mdev);
3892 }
3893
3894 return rv;
3895}
3896
3897/**
3898 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3899 * @mdev: DRBD device.
3900 *
3901 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3902 */
3903int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3904{
3905 int rv = -EIO;
3906
Philipp Reisner07782862010-08-31 12:00:50 +02003907 drbd_resume_al(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003908 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3909 drbd_bm_clear_all(mdev);
3910 rv = drbd_bm_write(mdev);
3911 put_ldev(mdev);
3912 }
3913
3914 return rv;
3915}
3916
3917static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3918{
3919 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003920 int rv = -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003921
3922 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3923
Lars Ellenberg02851e92010-12-16 14:47:39 +01003924 if (get_ldev(mdev)) {
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003925 drbd_bm_lock(mdev, work->why, work->flags);
Lars Ellenberg02851e92010-12-16 14:47:39 +01003926 rv = work->io_fn(mdev);
3927 drbd_bm_unlock(mdev);
3928 put_ldev(mdev);
3929 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003930
3931 clear_bit(BITMAP_IO, &mdev->flags);
Philipp Reisner127b3172010-11-16 10:07:53 +01003932 smp_mb__after_clear_bit();
Philipp Reisnerb411b362009-09-25 16:07:19 -07003933 wake_up(&mdev->misc_wait);
3934
3935 if (work->done)
3936 work->done(mdev, rv);
3937
3938 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3939 work->why = NULL;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003940 work->flags = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003941
3942 return 1;
3943}
3944
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003945void drbd_ldev_destroy(struct drbd_conf *mdev)
3946{
3947 lc_destroy(mdev->resync);
3948 mdev->resync = NULL;
3949 lc_destroy(mdev->act_log);
3950 mdev->act_log = NULL;
3951 __no_warn(local,
3952 drbd_free_bc(mdev->ldev);
3953 mdev->ldev = NULL;);
3954
3955 if (mdev->md_io_tmpp) {
3956 __free_page(mdev->md_io_tmpp);
3957 mdev->md_io_tmpp = NULL;
3958 }
3959 clear_bit(GO_DISKLESS, &mdev->flags);
3960}
3961
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003962static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3963{
3964 D_ASSERT(mdev->state.disk == D_FAILED);
Lars Ellenberg9d282872010-10-14 13:57:07 +02003965 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3966 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003967 * the protected members anymore, though, so once put_ldev reaches zero
3968 * again, it will be safe to free them. */
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003969 drbd_force_state(mdev, NS(disk, D_DISKLESS));
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003970 return 1;
3971}
3972
3973void drbd_go_diskless(struct drbd_conf *mdev)
3974{
3975 D_ASSERT(mdev->state.disk == D_FAILED);
3976 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
Lars Ellenberg9d282872010-10-14 13:57:07 +02003977 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
Lars Ellenberge9e6f3e2010-09-14 20:26:27 +02003978}
3979
Philipp Reisnerb411b362009-09-25 16:07:19 -07003980/**
3981 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3982 * @mdev: DRBD device.
3983 * @io_fn: IO callback to be called when bitmap IO is possible
3984 * @done: callback to be called after the bitmap IO was performed
3985 * @why: Descriptive text of the reason for doing the IO
3986 *
3987	 * While IO on the bitmap happens we freeze application IO, thus ensuring
3988	 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
3989 * called from worker context. It MUST NOT be used while a previous such
3990 * work is still pending!
3991 */
3992void drbd_queue_bitmap_io(struct drbd_conf *mdev,
3993 int (*io_fn)(struct drbd_conf *),
3994 void (*done)(struct drbd_conf *, int),
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003995 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003996{
3997 D_ASSERT(current == mdev->worker.task);
3998
3999 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4000 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4001 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4002 if (mdev->bm_io_work.why)
4003 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4004 why, mdev->bm_io_work.why);
4005
4006 mdev->bm_io_work.io_fn = io_fn;
4007 mdev->bm_io_work.done = done;
4008 mdev->bm_io_work.why = why;
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004009 mdev->bm_io_work.flags = flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004010
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004011 spin_lock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004012 set_bit(BITMAP_IO, &mdev->flags);
4013 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
Philipp Reisner127b3172010-11-16 10:07:53 +01004014 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004015 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004016 }
Philipp Reisner22afd7e2010-11-16 15:30:44 +01004017 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004018}
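/*
 * Illustrative call (the reason string and flag choice are examples, not
 * a fixed contract): queueing a full "set all bits and write out" pass
 * from worker context could look like
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
 *			     "set_n_write example", BM_LOCKED_SET_ALLOWED);
 *
 * io_fn then runs later in w_bitmap_io() once ap_bio_cnt has drained.
 */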
4019
4020/**
4021 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4022 * @mdev: DRBD device.
4023 * @io_fn: IO callback to be called when bitmap IO is possible
4024 * @why: Descriptive text of the reason for doing the IO
4025 *
4026	 * Freezes application IO while the actual IO operation runs. This
4027	 * function MAY NOT be called from worker context.
4028 */
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004029int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4030 char *why, enum bm_flag flags)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004031{
4032 int rv;
4033
4034 D_ASSERT(current != mdev->worker.task);
4035
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004036 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4037 drbd_suspend_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004038
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004039 drbd_bm_lock(mdev, why, flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004040 rv = io_fn(mdev);
4041 drbd_bm_unlock(mdev);
4042
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01004043 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4044 drbd_resume_io(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004045
4046 return rv;
4047}
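/*
 * Illustrative synchronous counterpart (reason string is an example),
 * called from some context other than the worker:
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *			    "clear_n_write example", BM_LOCKED_SET_ALLOWED);
 *
 * As visible above, passing flags without BM_LOCKED_SET_ALLOWED
 * additionally suspends and resumes application IO around io_fn.
 */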
4048
4049void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4050{
4051 if ((mdev->ldev->md.flags & flag) != flag) {
4052 drbd_md_mark_dirty(mdev);
4053 mdev->ldev->md.flags |= flag;
4054 }
4055}
4056
4057void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4058{
4059 if ((mdev->ldev->md.flags & flag) != 0) {
4060 drbd_md_mark_dirty(mdev);
4061 mdev->ldev->md.flags &= ~flag;
4062 }
4063}
4064int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4065{
4066 return (bdev->md.flags & flag) != 0;
4067}
4068
4069static void md_sync_timer_fn(unsigned long data)
4070{
4071 struct drbd_conf *mdev = (struct drbd_conf *) data;
4072
4073 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4074}
4075
4076static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4077{
4078 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
Lars Ellenbergee15b032010-09-03 10:00:09 +02004079#ifdef DEBUG
4080 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4081 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4082#endif
Philipp Reisnerb411b362009-09-25 16:07:19 -07004083 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004084 return 1;
4085}
4086
4087#ifdef CONFIG_DRBD_FAULT_INJECTION
4088/* Fault insertion support including random number generator shamelessly
4089 * stolen from kernel/rcutorture.c */
4090struct fault_random_state {
4091 unsigned long state;
4092 unsigned long count;
4093};
4094
4095#define FAULT_RANDOM_MULT 39916801 /* prime */
4096#define FAULT_RANDOM_ADD 479001701 /* prime */
4097#define FAULT_RANDOM_REFRESH 10000
4098
4099/*
4100 * Crude but fast random-number generator. Uses a linear congruential
4101 * generator, with occasional help from get_random_bytes().
4102 */
4103static unsigned long
4104_drbd_fault_random(struct fault_random_state *rsp)
4105{
4106 long refresh;
4107
Roel Kluin49829ea2009-12-15 22:55:44 +01004108 if (!rsp->count--) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004109 get_random_bytes(&refresh, sizeof(refresh));
4110 rsp->state += refresh;
4111 rsp->count = FAULT_RANDOM_REFRESH;
4112 }
4113 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4114 return swahw32(rsp->state);
4115}
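/*
 * The generator above is the classic linear congruential recurrence
 *
 *	state(n+1) = state(n) * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD
 *
 * (modulo the word size), reseeded from get_random_bytes() every
 * FAULT_RANDOM_REFRESH calls; swahw32() swaps the 16-bit halves of the
 * low 32 bits so the weaker low-order LCG bits do not dominate the
 * returned value.
 */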
4116
4117static char *
4118_drbd_fault_str(unsigned int type) {
4119 static char *_faults[] = {
4120 [DRBD_FAULT_MD_WR] = "Meta-data write",
4121 [DRBD_FAULT_MD_RD] = "Meta-data read",
4122 [DRBD_FAULT_RS_WR] = "Resync write",
4123 [DRBD_FAULT_RS_RD] = "Resync read",
4124 [DRBD_FAULT_DT_WR] = "Data write",
4125 [DRBD_FAULT_DT_RD] = "Data read",
4126 [DRBD_FAULT_DT_RA] = "Data read ahead",
4127 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
Philipp Reisner6b4388a2010-04-26 14:11:45 +02004128 [DRBD_FAULT_AL_EE] = "EE allocation",
4129 [DRBD_FAULT_RECEIVE] = "receive data corruption",
Philipp Reisnerb411b362009-09-25 16:07:19 -07004130 };
4131
4132 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4133}
4134
4135unsigned int
4136_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4137{
4138 static struct fault_random_state rrs = {0, 0};
4139
4140 unsigned int ret = (
4141 (fault_devs == 0 ||
4142 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4143 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4144
4145 if (ret) {
4146 fault_count++;
4147
Lars Ellenberg73835062010-05-27 11:51:56 +02004148 if (__ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004149 dev_warn(DEV, "***Simulating %s failure\n",
4150 _drbd_fault_str(type));
4151 }
4152
4153 return ret;
4154}
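/*
 * Decision sketch: a request of the given fault type is failed when its
 * minor is selected by the fault_devs bitmask (or fault_devs is 0,
 * meaning "all devices") and a roll of 1..100 from the generator above
 * is at most fault_rate.  For example, fault_rate=10 with fault_devs=0
 * fails roughly one in ten eligible operations on every minor.
 */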
4155#endif
4156
4157const char *drbd_buildtag(void)
4158{
4159 /* DRBD built from external sources has here a reference to the
4160 git hash of the source code. */
4161
4162 static char buildtag[38] = "\0uilt-in";
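	/*
	 * "\0uilt-in" starts with a NUL byte on purpose: the first call
	 * either overwrites the buffer with the module srcversion, or,
	 * for a kernel with DRBD built in, just patches the first byte
	 * to 'b' so the string reads "built-in".
	 */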
4163
4164 if (buildtag[0] == 0) {
4165#ifdef CONFIG_MODULES
4166 if (THIS_MODULE != NULL)
4167 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4168 else
4169#endif
4170 buildtag[0] = 'b';
4171 }
4172
4173 return buildtag;
4174}
4175
4176module_init(drbd_init)
4177module_exit(drbd_cleanup)
4178
Philipp Reisnerb411b362009-09-25 16:07:19 -07004179EXPORT_SYMBOL(drbd_conn_str);
4180EXPORT_SYMBOL(drbd_role_str);
4181EXPORT_SYMBOL(drbd_disk_str);
4182EXPORT_SYMBOL(drbd_set_st_err_str);