/*
   drbd_receiver.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 */


#include <linux/module.h>

#include <asm/uaccess.h>
#include <net/sock.h>

#include <linux/drbd.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/in.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/pkt_sched.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/vmalloc.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
#include "drbd_int.h"
#include "drbd_req.h"

#include "drbd_vli.h"

enum finish_epoch {
	FE_STILL_LIVE,
	FE_DESTROYED,
	FE_RECYCLED,
};

static int drbd_do_handshake(struct drbd_conf *mdev);
static int drbd_do_auth(struct drbd_conf *mdev);

static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
static int e_end_block(struct drbd_conf *, struct drbd_work *, int);


#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

/*
 * some helper functions to deal with singly linked page lists,
 * page->private being our "next" pointer.
 */

/* If at least n pages are linked at head, get n pages off.
 * Otherwise, don't modify head, and return NULL.
 * Locking is the responsibility of the caller.
 */
static struct page *page_chain_del(struct page **head, int n)
{
	struct page *page;
	struct page *tmp;

	BUG_ON(!n);
	BUG_ON(!head);

	page = *head;

	if (!page)
		return NULL;

	while (page) {
		tmp = page_chain_next(page);
		if (--n == 0)
			break; /* found sufficient pages */
		if (tmp == NULL)
			/* insufficient pages, don't use any of them. */
			return NULL;
		page = tmp;
	}

	/* add end of list marker for the returned list */
	set_page_private(page, 0);
	/* actual return value, and adjustment of head */
	page = *head;
	*head = tmp;
	return page;
}

/* may be used outside of locks to find the tail of a (usually short)
 * "private" page chain, before adding it back to a global chain head
 * with page_chain_add() under a spinlock. */
static struct page *page_chain_tail(struct page *page, int *len)
{
	struct page *tmp;
	int i = 1;
	while ((tmp = page_chain_next(page)))
		++i, page = tmp;
	if (len)
		*len = i;
	return page;
}

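/* Put every page of the chain and return how many pages were freed. */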
static int page_chain_free(struct page *page)
{
	struct page *tmp;
	int i = 0;
	page_chain_for_each_safe(page, tmp) {
		put_page(page);
		++i;
	}
	return i;
}

static void page_chain_add(struct page **head,
		struct page *chain_first, struct page *chain_last)
{
#if 1
	struct page *tmp;
	tmp = page_chain_tail(chain_first, NULL);
	BUG_ON(tmp != chain_last);
#endif

	/* add chain to head */
	set_page_private(chain_last, (unsigned long)*head);
	*head = chain_first;
}

static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
{
	struct page *page = NULL;
	struct page *tmp = NULL;
	int i = 0;

	/* Yes, testing drbd_pp_vacant outside the lock is racy.
	 * So what. It saves a spin_lock. */
	if (drbd_pp_vacant >= number) {
		spin_lock(&drbd_pp_lock);
		page = page_chain_del(&drbd_pp_pool, number);
		if (page)
			drbd_pp_vacant -= number;
		spin_unlock(&drbd_pp_lock);
		if (page)
			return page;
	}

	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
	 * "criss-cross" setup, that might cause write-out on some other DRBD,
	 * which in turn might block on the other node at this very place. */
	for (i = 0; i < number; i++) {
		tmp = alloc_page(GFP_TRY);
		if (!tmp)
			break;
		set_page_private(tmp, (unsigned long)page);
		page = tmp;
	}

	if (i == number)
		return page;

	/* Not enough pages immediately available this time.
	 * No need to jump around here, drbd_pp_alloc will retry this
	 * function "soon". */
	if (page) {
		tmp = page_chain_tail(page, NULL);
		spin_lock(&drbd_pp_lock);
		page_chain_add(&drbd_pp_pool, page, tmp);
		drbd_pp_vacant += i;
		spin_unlock(&drbd_pp_lock);
	}
	return NULL;
}

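/* Collect finished entries from the head of mdev->net_ee onto @to_be_freed;
 * both callers invoke this under the req_lock. */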
static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
{
	struct drbd_epoch_entry *e;
	struct list_head *le, *tle;

	/* The EEs are always appended to the end of the list. Since
	   they are sent in order over the wire, they have to finish
	   in order. As soon as we see the first unfinished one we can
	   stop examining the list... */

	list_for_each_safe(le, tle, &mdev->net_ee) {
		e = list_entry(le, struct drbd_epoch_entry, w.list);
		if (drbd_ee_has_active_page(e))
			break;
		list_move(le, to_be_freed);
	}
}

static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
{
	LIST_HEAD(reclaimed);
	struct drbd_epoch_entry *e, *t;

	spin_lock_irq(&mdev->req_lock);
	reclaim_net_ee(mdev, &reclaimed);
	spin_unlock_irq(&mdev->req_lock);

	list_for_each_entry_safe(e, t, &reclaimed, w.list)
		drbd_free_net_ee(mdev, e);
}

221/**
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200222 * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700223 * @mdev: DRBD device.
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200224 * @number: number of pages requested
225 * @retry: whether to retry, if not enough pages are available right now
Philipp Reisnerb411b362009-09-25 16:07:19 -0700226 *
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200227 * Tries to allocate number pages, first from our own page pool, then from
228 * the kernel, unless this allocation would exceed the max_buffers setting.
229 * Possibly retry until DRBD frees sufficient pages somewhere else.
230 *
231 * Returns a page chain linked via page->private.
Philipp Reisnerb411b362009-09-25 16:07:19 -0700232 */
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200233static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700234{
235 struct page *page = NULL;
236 DEFINE_WAIT(wait);
237
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200238 /* Yes, we may run up to @number over max_buffers. If we
239 * follow it strictly, the admin will get it wrong anyways. */
240 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
241 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700242
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200243 while (page == NULL) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700244 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
245
246 drbd_kick_lo_and_reclaim_net(mdev);
247
248 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200249 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700250 if (page)
251 break;
252 }
253
254 if (!retry)
255 break;
256
257 if (signal_pending(current)) {
258 dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
259 break;
260 }
261
262 schedule();
263 }
264 finish_wait(&drbd_pp_wait, &wait);
265
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200266 if (page)
267 atomic_add(number, &mdev->pp_in_use);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700268 return page;
269}
270
271/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200272 * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
273 * Either links the page chain back to the global pool,
274 * or returns all pages to the system. */
Lars Ellenberg435f0742010-09-06 12:30:25 +0200275static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700276{
Lars Ellenberg435f0742010-09-06 12:30:25 +0200277 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700278 int i;
Lars Ellenberg435f0742010-09-06 12:30:25 +0200279
Lars Ellenberga73ff322012-06-25 19:15:38 +0200280 if (page == NULL)
281 return;
282
Lars Ellenberg1816a2b2010-11-11 15:19:07 +0100283 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200284 i = page_chain_free(page);
285 else {
286 struct page *tmp;
287 tmp = page_chain_tail(page, &i);
288 spin_lock(&drbd_pp_lock);
289 page_chain_add(&drbd_pp_pool, page, tmp);
290 drbd_pp_vacant += i;
291 spin_unlock(&drbd_pp_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700292 }
Lars Ellenberg435f0742010-09-06 12:30:25 +0200293 i = atomic_sub_return(i, a);
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200294 if (i < 0)
Lars Ellenberg435f0742010-09-06 12:30:25 +0200295 dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
296 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700297 wake_up(&drbd_pp_wait);
298}
299
300/*
301You need to hold the req_lock:
302 _drbd_wait_ee_list_empty()
303
304You must not have the req_lock:
305 drbd_free_ee()
306 drbd_alloc_ee()
307 drbd_init_ee()
308 drbd_release_ee()
309 drbd_ee_fix_bhs()
310 drbd_process_done_ee()
311 drbd_clear_done_ee()
312 drbd_wait_ee_list_empty()
313*/
314
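/* Allocate an epoch entry from the mempool and attach a freshly allocated
 * page chain large enough for @data_size (no pages for a zero-sized request). */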
struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
				     u64 id,
				     sector_t sector,
				     unsigned int data_size,
				     gfp_t gfp_mask) __must_hold(local)
{
	struct drbd_epoch_entry *e;
	struct page *page = NULL;
	unsigned nr_pages = (data_size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
		return NULL;

	e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
	if (!e) {
		if (!(gfp_mask & __GFP_NOWARN))
			dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
		return NULL;
	}

	if (data_size) {
		page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
		if (!page)
			goto fail;
	}

	INIT_HLIST_NODE(&e->collision);
	e->epoch = NULL;
	e->mdev = mdev;
	e->pages = page;
	atomic_set(&e->pending_bios, 0);
	e->size = data_size;
	e->flags = 0;
	e->sector = sector;
	e->block_id = id;

	return e;

 fail:
	mempool_free(e, drbd_ee_mempool);
	return NULL;
}

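/* Release one epoch entry: free its digest (if any) and its page chain,
 * then give the entry back to the mempool. */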
void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
{
	if (e->flags & EE_HAS_DIGEST)
		kfree(e->digest);
	drbd_pp_free(mdev, e->pages, is_net);
	D_ASSERT(atomic_read(&e->pending_bios) == 0);
	D_ASSERT(hlist_unhashed(&e->collision));
	mempool_free(e, drbd_ee_mempool);
}

int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
{
	LIST_HEAD(work_list);
	struct drbd_epoch_entry *e, *t;
	int count = 0;
	int is_net = list == &mdev->net_ee;

	spin_lock_irq(&mdev->req_lock);
	list_splice_init(list, &work_list);
	spin_unlock_irq(&mdev->req_lock);

	list_for_each_entry_safe(e, t, &work_list, w.list) {
		drbd_free_some_ee(mdev, e, is_net);
		count++;
	}
	return count;
}


/*
 * This function is called from _asender only_
 * but see also comments in _req_mod(,barrier_acked)
 * and receive_Barrier.
 *
 * Move entries from net_ee to done_ee, if ready.
 * Grab done_ee, call all callbacks, free the entries.
 * The callbacks typically send out ACKs.
 */
static int drbd_process_done_ee(struct drbd_conf *mdev)
{
	LIST_HEAD(work_list);
	LIST_HEAD(reclaimed);
	struct drbd_epoch_entry *e, *t;
	int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);

	spin_lock_irq(&mdev->req_lock);
	reclaim_net_ee(mdev, &reclaimed);
	list_splice_init(&mdev->done_ee, &work_list);
	spin_unlock_irq(&mdev->req_lock);

	list_for_each_entry_safe(e, t, &reclaimed, w.list)
		drbd_free_net_ee(mdev, e);

	/* possible callbacks here:
	 * e_end_block, and e_end_resync_block, e_send_discard_ack.
	 * all ignore the last argument.
	 */
	list_for_each_entry_safe(e, t, &work_list, w.list) {
		/* list_del not necessary, next/prev members not touched */
		ok = e->w.cb(mdev, &e->w, !ok) && ok;
		drbd_free_ee(mdev, e);
	}
	wake_up(&mdev->ee_wait);

	return ok;
}

void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
{
	DEFINE_WAIT(wait);

	/* avoids spin_lock/unlock
	 * and calling prepare_to_wait in the fast path */
	while (!list_empty(head)) {
		prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mdev->req_lock);
		io_schedule();
		finish_wait(&mdev->ee_wait, &wait);
		spin_lock_irq(&mdev->req_lock);
	}
}

void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
{
	spin_lock_irq(&mdev->req_lock);
	_drbd_wait_ee_list_empty(mdev, head);
	spin_unlock_irq(&mdev->req_lock);
}

/* see also kernel_accept, which is only present since 2.6.18;
 * we also want to log exactly which part of it failed */
static int drbd_accept(struct drbd_conf *mdev, const char **what,
		struct socket *sock, struct socket **newsock)
{
	struct sock *sk = sock->sk;
	int err = 0;

	*what = "listen";
	err = sock->ops->listen(sock, 5);
	if (err < 0)
		goto out;

	*what = "sock_create_lite";
	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
			       newsock);
	if (err < 0)
		goto out;

	*what = "accept";
	err = sock->ops->accept(sock, *newsock, 0);
	if (err < 0) {
		sock_release(*newsock);
		*newsock = NULL;
		goto out;
	}
	(*newsock)->ops = sock->ops;
	__module_get((*newsock)->ops->owner);

out:
	return err;
}

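/* One-shot kernel-space recvmsg; by default waits for the full @size
 * unless different @flags are passed in. */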
static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
		    void *buf, size_t size, int flags)
{
	mm_segment_t oldfs;
	struct kvec iov = {
		.iov_base = buf,
		.iov_len = size,
	};
	struct msghdr msg = {
		.msg_iovlen = 1,
		.msg_iov = (struct iovec *)&iov,
		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
	};
	int rv;

	oldfs = get_fs();
	set_fs(KERNEL_DS);
	rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
	set_fs(oldfs);

	return rv;
}

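/* Receive exactly @size bytes from the data socket.  Anything short of that
 * (signal, reset, or shutdown by the peer) forces the connection into
 * C_BROKEN_PIPE. */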
static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
{
	mm_segment_t oldfs;
	struct kvec iov = {
		.iov_base = buf,
		.iov_len = size,
	};
	struct msghdr msg = {
		.msg_iovlen = 1,
		.msg_iov = (struct iovec *)&iov,
		.msg_flags = MSG_WAITALL | MSG_NOSIGNAL
	};
	int rv;

	oldfs = get_fs();
	set_fs(KERNEL_DS);

	for (;;) {
		rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
		if (rv == size)
			break;

		/* Note:
		 * ECONNRESET	other side closed the connection
		 * ERESTARTSYS	(on sock) we got a signal
		 */

		if (rv < 0) {
			if (rv == -ECONNRESET)
				dev_info(DEV, "sock was reset by peer\n");
			else if (rv != -ERESTARTSYS)
				dev_err(DEV, "sock_recvmsg returned %d\n", rv);
			break;
		} else if (rv == 0) {
			dev_info(DEV, "sock was shut down by peer\n");
			break;
		} else {
			/* signal came in, or peer/link went down,
			 * after we read a partial message
			 */
			/* D_ASSERT(signal_pending(current)); */
			break;
		}
	};

	set_fs(oldfs);

	if (rv != size)
		drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));

	return rv;
}

/* quoting tcp(7):
 *   On individual connections, the socket buffer size must be set prior to the
 *   listen(2) or connect(2) calls in order to have it take effect.
 * This is our wrapper to do so.
 */
static void drbd_setbufsize(struct socket *sock, unsigned int snd,
		unsigned int rcv)
{
	/* open coded SO_SNDBUF, SO_RCVBUF */
	if (snd) {
		sock->sk->sk_sndbuf = snd;
		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
	}
	if (rcv) {
		sock->sk->sk_rcvbuf = rcv;
		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
	}
}

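/* Create an outgoing TCP socket, bind it to the configured local address
 * (port 0, so the kernel picks a free one) and try to connect to the peer.
 * Returns the connected socket, or NULL on failure. */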
static struct socket *drbd_try_connect(struct drbd_conf *mdev)
{
	const char *what;
	struct socket *sock;
	struct sockaddr_in6 src_in6;
	int err;
	int disconnect_on_error = 1;

	if (!get_net_conf(mdev))
		return NULL;

	what = "sock_create_kern";
	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
		SOCK_STREAM, IPPROTO_TCP, &sock);
	if (err < 0) {
		sock = NULL;
		goto out;
	}

	sock->sk->sk_rcvtimeo =
	sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
	drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
			mdev->net_conf->rcvbuf_size);

	/* explicitly bind to the configured IP as source IP
	 * for the outgoing connections.
	 * This is needed for multihomed hosts and to be
	 * able to use lo: interfaces for drbd.
	 * Make sure to use 0 as port number, so linux selects
	 * a free one dynamically.
	 */
	memcpy(&src_in6, mdev->net_conf->my_addr,
	       min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
	if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
		src_in6.sin6_port = 0;
	else
		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */

	what = "bind before connect";
	err = sock->ops->bind(sock,
			      (struct sockaddr *) &src_in6,
			      mdev->net_conf->my_addr_len);
	if (err < 0)
		goto out;

	/* connect may fail, peer not yet available.
	 * stay C_WF_CONNECTION, don't go Disconnecting! */
	disconnect_on_error = 0;
	what = "connect";
	err = sock->ops->connect(sock,
				 (struct sockaddr *)mdev->net_conf->peer_addr,
				 mdev->net_conf->peer_addr_len, 0);

out:
	if (err < 0) {
		if (sock) {
			sock_release(sock);
			sock = NULL;
		}
		switch (-err) {
			/* timeout, busy, signal pending */
		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
		case EINTR: case ERESTARTSYS:
			/* peer not (yet) available, network problem */
		case ECONNREFUSED: case ENETUNREACH:
		case EHOSTDOWN:    case EHOSTUNREACH:
			disconnect_on_error = 0;
			break;
		default:
			dev_err(DEV, "%s failed, err = %d\n", what, err);
		}
		if (disconnect_on_error)
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
	}
	put_net_conf(mdev);
	return sock;
}

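/* Passive side of connection setup: create a listening socket on the
 * configured local address and wait (with a randomly jittered timeout) for
 * the peer to connect.  Returns the established socket or NULL. */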
static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
{
	int timeo, err;
	struct socket *s_estab = NULL, *s_listen;
	const char *what;

	if (!get_net_conf(mdev))
		return NULL;

	what = "sock_create_kern";
	err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
		SOCK_STREAM, IPPROTO_TCP, &s_listen);
	if (err) {
		s_listen = NULL;
		goto out;
	}

	timeo = mdev->net_conf->try_connect_int * HZ;
	timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */

	s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
	s_listen->sk->sk_rcvtimeo = timeo;
	s_listen->sk->sk_sndtimeo = timeo;
	drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
			mdev->net_conf->rcvbuf_size);

	what = "bind before listen";
	err = s_listen->ops->bind(s_listen,
			      (struct sockaddr *) mdev->net_conf->my_addr,
			      mdev->net_conf->my_addr_len);
	if (err < 0)
		goto out;

	err = drbd_accept(mdev, &what, s_listen, &s_estab);

out:
	if (s_listen)
		sock_release(s_listen);
	if (err < 0) {
		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
			dev_err(DEV, "%s failed, err = %d\n", what, err);
			drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
		}
	}
	put_net_conf(mdev);

	return s_estab;
}

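/* Exchange of the "first packet" on a freshly created socket, used to tell
 * the data socket and the meta-data socket apart during connect. */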
static int drbd_send_fp(struct drbd_conf *mdev,
	struct socket *sock, enum drbd_packets cmd)
{
	struct p_header80 *h = &mdev->data.sbuf.header.h80;

	return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
}

static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
{
	struct p_header80 *h = &mdev->data.rbuf.header.h80;
	int rr;

	rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);

	if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
		return be16_to_cpu(h->command);

	return 0xffff;
}

/**
 * drbd_socket_okay() - Free the socket if its connection is not okay
 * @mdev:	DRBD device.
 * @sock:	pointer to the pointer to the socket.
 */
static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
{
	int rr;
	char tb[4];

	if (!*sock)
		return false;

	rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);

	if (rr > 0 || rr == -EAGAIN) {
		return true;
	} else {
		sock_release(*sock);
		*sock = NULL;
		return false;
	}
}

/*
 * return values:
 *   1 yes, we have a valid connection
 *   0 oops, did not work out, please try again
 *  -1 peer talks different language,
 *     no point in trying again, please go standalone.
 *  -2 We do not have a network config...
 */
static int drbd_connect(struct drbd_conf *mdev)
{
	struct socket *s, *sock, *msock;
	int try, h, ok;
	enum drbd_state_rv rv;

	D_ASSERT(!mdev->data.socket);

	if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
		return -2;

	clear_bit(DISCARD_CONCURRENT, &mdev->flags);

	sock  = NULL;
	msock = NULL;

	do {
		for (try = 0;;) {
			/* 3 tries, this should take less than a second! */
			s = drbd_try_connect(mdev);
			if (s || ++try >= 3)
				break;
			/* give the other side time to call bind() & listen() */
			schedule_timeout_interruptible(HZ / 10);
		}

		if (s) {
			if (!sock) {
				drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
				sock = s;
				s = NULL;
			} else if (!msock) {
				drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
				msock = s;
				s = NULL;
			} else {
				dev_err(DEV, "Logic error in drbd_connect()\n");
				goto out_release_sockets;
			}
		}

		if (sock && msock) {
			schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10);
			ok = drbd_socket_okay(mdev, &sock);
			ok = drbd_socket_okay(mdev, &msock) && ok;
			if (ok)
				break;
		}

retry:
		s = drbd_wait_for_connect(mdev);
		if (s) {
			try = drbd_recv_fp(mdev, s);
			drbd_socket_okay(mdev, &sock);
			drbd_socket_okay(mdev, &msock);
			switch (try) {
			case P_HAND_SHAKE_S:
				if (sock) {
					dev_warn(DEV, "initial packet S crossed\n");
					sock_release(sock);
				}
				sock = s;
				break;
			case P_HAND_SHAKE_M:
				if (msock) {
					dev_warn(DEV, "initial packet M crossed\n");
					sock_release(msock);
				}
				msock = s;
				set_bit(DISCARD_CONCURRENT, &mdev->flags);
				break;
			default:
				dev_warn(DEV, "Error receiving initial packet\n");
				sock_release(s);
				if (random32() & 1)
					goto retry;
			}
		}

		if (mdev->state.conn <= C_DISCONNECTING)
			goto out_release_sockets;
		if (signal_pending(current)) {
			flush_signals(current);
			smp_rmb();
			if (get_t_state(&mdev->receiver) == Exiting)
				goto out_release_sockets;
		}

		if (sock && msock) {
			ok = drbd_socket_okay(mdev, &sock);
			ok = drbd_socket_okay(mdev, &msock) && ok;
			if (ok)
				break;
		}
	} while (1);

	msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
	sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */

	sock->sk->sk_allocation = GFP_NOIO;
	msock->sk->sk_allocation = GFP_NOIO;

	sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
	msock->sk->sk_priority = TC_PRIO_INTERACTIVE;

	/* NOT YET ...
	 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
	 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	 * first set it to the P_HAND_SHAKE timeout,
	 * which we set to 4x the configured ping_timeout. */
	sock->sk->sk_sndtimeo =
	sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;

	msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
	msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;

	/* we don't want delays.
	 * we use TCP_CORK where appropriate, though */
	drbd_tcp_nodelay(sock);
	drbd_tcp_nodelay(msock);

	mdev->data.socket = sock;
	mdev->meta.socket = msock;
	mdev->last_received = jiffies;

	D_ASSERT(mdev->asender.task == NULL);

	h = drbd_do_handshake(mdev);
	if (h <= 0)
		return h;

	if (mdev->cram_hmac_tfm) {
		/* drbd_request_state(mdev, NS(conn, WFAuth)); */
		switch (drbd_do_auth(mdev)) {
		case -1:
			dev_err(DEV, "Authentication of peer failed\n");
			return -1;
		case 0:
			dev_err(DEV, "Authentication of peer failed, trying again.\n");
			return 0;
		}
	}

	sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
	sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;

	atomic_set(&mdev->packet_seq, 0);
	mdev->peer_seq = 0;

	if (drbd_send_protocol(mdev) == -1)
		return -1;
	set_bit(STATE_SENT, &mdev->flags);
	drbd_send_sync_param(mdev, &mdev->sync_conf);
	drbd_send_sizes(mdev, 0, 0);
	drbd_send_uuids(mdev);
	drbd_send_current_state(mdev);
	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
	clear_bit(RESIZE_PENDING, &mdev->flags);

	spin_lock_irq(&mdev->req_lock);
	rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
	if (mdev->state.conn != C_WF_REPORT_PARAMS)
		clear_bit(STATE_SENT, &mdev->flags);
	spin_unlock_irq(&mdev->req_lock);

	if (rv < SS_SUCCESS)
		return 0;

	drbd_thread_start(&mdev->asender);
	mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */

	return 1;

out_release_sockets:
	if (sock)
		sock_release(sock);
	if (msock)
		sock_release(msock);
	return -1;
}

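/* Read one packet header from the data socket and decode command and length,
 * accepting both supported header layouts (h80 and the bigger h95). */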
static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
{
	union p_header *h = &mdev->data.rbuf.header;
	int r;

	r = drbd_recv(mdev, h, sizeof(*h));
	if (unlikely(r != sizeof(*h))) {
		if (!signal_pending(current))
			dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
		return false;
	}

	if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
		*cmd = be16_to_cpu(h->h80.command);
		*packet_size = be16_to_cpu(h->h80.length);
	} else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
		*cmd = be16_to_cpu(h->h95.command);
		*packet_size = be32_to_cpu(h->h95.length);
	} else {
		dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
		    be32_to_cpu(h->h80.magic),
		    be16_to_cpu(h->h80.command),
		    be16_to_cpu(h->h80.length));
		return false;
	}
	mdev->last_received = jiffies;

	return true;
}

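/* Issue a flush to the local backing device if the current write ordering
 * policy asks for it; on failure fall back to WO_drain_io. */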
static void drbd_flush(struct drbd_conf *mdev)
{
	int rv;

	if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
					NULL);
		if (rv) {
			dev_info(DEV, "local disk flush failed with status %d\n", rv);
			/* would rather check on EOPNOTSUPP, but that is not reliable.
			 * don't try again for ANY return value != 0
			 * if (rv == -EOPNOTSUPP) */
			drbd_bump_write_ordering(mdev, WO_drain_io);
		}
		put_ldev(mdev);
	}
}

/**
 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
 * @mdev:	DRBD device.
 * @epoch:	Epoch object.
 * @ev:		Epoch event.
 */
static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
					       struct drbd_epoch *epoch,
					       enum epoch_event ev)
{
	int epoch_size;
	struct drbd_epoch *next_epoch;
	enum finish_epoch rv = FE_STILL_LIVE;

	spin_lock(&mdev->epoch_lock);
	do {
		next_epoch = NULL;

		epoch_size = atomic_read(&epoch->epoch_size);

		switch (ev & ~EV_CLEANUP) {
		case EV_PUT:
			atomic_dec(&epoch->active);
			break;
		case EV_GOT_BARRIER_NR:
			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
			break;
		case EV_BECAME_LAST:
			/* nothing to do*/
			break;
		}

		if (epoch_size != 0 &&
		    atomic_read(&epoch->active) == 0 &&
		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
			if (!(ev & EV_CLEANUP)) {
				spin_unlock(&mdev->epoch_lock);
				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
				spin_lock(&mdev->epoch_lock);
			}
			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
				dec_unacked(mdev);

			if (mdev->current_epoch != epoch) {
				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
				list_del(&epoch->list);
				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
				mdev->epochs--;
				kfree(epoch);

				if (rv == FE_STILL_LIVE)
					rv = FE_DESTROYED;
			} else {
				epoch->flags = 0;
				atomic_set(&epoch->epoch_size, 0);
				/* atomic_set(&epoch->active, 0); is already zero */
				if (rv == FE_STILL_LIVE)
					rv = FE_RECYCLED;
				wake_up(&mdev->ee_wait);
			}
		}

		if (!next_epoch)
			break;

		epoch = next_epoch;
	} while (1);

	spin_unlock(&mdev->epoch_lock);

	return rv;
}

/**
 * drbd_bump_write_ordering() - Fall back to another write ordering method
 * @mdev:	DRBD device.
 * @wo:		Write ordering method to try.
 */
void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
{
	enum write_ordering_e pwo;
	static char *write_ordering_str[] = {
		[WO_none] = "none",
		[WO_drain_io] = "drain",
		[WO_bdev_flush] = "flush",
	};

	pwo = mdev->write_ordering;
	wo = min(pwo, wo);
	if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
		wo = WO_drain_io;
	if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
		wo = WO_none;
	mdev->write_ordering = wo;
	if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
		dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
}

/**
 * drbd_submit_ee()
 * @mdev:	DRBD device.
 * @e:		epoch entry
 * @rw:		flag field, see bio->bi_rw
 *
 * May spread the pages to multiple bios,
 * depending on bio_add_page restrictions.
 *
 * Returns 0 if all bios have been submitted,
 * -ENOMEM if we could not allocate enough bios,
 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
 *  single page to an empty bio (which should never happen and likely indicates
 *  that the lower level IO stack is in some way broken). This has been observed
 *  on certain Xen deployments.
 */
/* TODO allocate from our own bio_set. */
int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
		const unsigned rw, const int fault_type)
{
	struct bio *bios = NULL;
	struct bio *bio;
	struct page *page = e->pages;
	sector_t sector = e->sector;
	unsigned ds = e->size;
	unsigned n_bios = 0;
	unsigned nr_pages = (ds + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int err = -ENOMEM;

	/* In most cases, we will only need one bio. But in case the lower
	 * level restrictions happen to be different at this offset on this
	 * side than those of the sending peer, we may need to submit the
	 * request in more than one bio.
	 *
	 * Plain bio_alloc is good enough here, this is no DRBD internally
	 * generated bio, but a bio allocated on behalf of the peer.
	 */
next_bio:
	bio = bio_alloc(GFP_NOIO, nr_pages);
	if (!bio) {
		dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
		goto fail;
	}
	/* > e->sector, unless this is the first bio */
	bio->bi_sector = sector;
	bio->bi_bdev = mdev->ldev->backing_bdev;
	bio->bi_rw = rw;
	bio->bi_private = e;
	bio->bi_end_io = drbd_endio_sec;

	bio->bi_next = bios;
	bios = bio;
	++n_bios;

	page_chain_for_each(page) {
		unsigned len = min_t(unsigned, ds, PAGE_SIZE);
		if (!bio_add_page(bio, page, len, 0)) {
			/* A single page must always be possible!
			 * But in case it fails anyways,
			 * we deal with it, and complain (below). */
			if (bio->bi_vcnt == 0) {
				dev_err(DEV,
					"bio_add_page failed for len=%u, "
					"bi_vcnt=0 (bi_sector=%llu)\n",
					len, (unsigned long long)bio->bi_sector);
				err = -ENOSPC;
				goto fail;
			}
			goto next_bio;
		}
		ds -= len;
		sector += len >> 9;
		--nr_pages;
	}
	D_ASSERT(page == NULL);
	D_ASSERT(ds == 0);

	atomic_set(&e->pending_bios, n_bios);
	do {
		bio = bios;
		bios = bios->bi_next;
		bio->bi_next = NULL;

		drbd_generic_make_request(mdev, fault_type, bio);
	} while (bios);
	return 0;

fail:
	while (bios) {
		bio = bios;
		bios = bios->bi_next;
		bio_put(bio);
	}
	return err;
}

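/* Handle an incoming barrier packet: finish the current epoch according to
 * the active write ordering method and open a new one. */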
static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
	int rv;
	struct p_barrier *p = &mdev->data.rbuf.barrier;
	struct drbd_epoch *epoch;

	inc_unacked(mdev);

	mdev->current_epoch->barrier_nr = p->barrier;
	rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);

	/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
	 * the activity log, which means it would not be resynced in case the
	 * R_PRIMARY crashes now.
	 * Therefore we must send the barrier_ack after the barrier request was
	 * completed. */
	switch (mdev->write_ordering) {
	case WO_none:
		if (rv == FE_RECYCLED)
			return true;

		/* receiver context, in the writeout path of the other node.
		 * avoid potential distributed deadlock */
		epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
		if (epoch)
			break;
		else
			dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
			/* Fall through */

	case WO_bdev_flush:
	case WO_drain_io:
		drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
		drbd_flush(mdev);

		if (atomic_read(&mdev->current_epoch->epoch_size)) {
			epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
			if (epoch)
				break;
		}

		epoch = mdev->current_epoch;
		wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);

		D_ASSERT(atomic_read(&epoch->active) == 0);
		D_ASSERT(epoch->flags == 0);

		return true;
	default:
		dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
		return false;
	}

	epoch->flags = 0;
	atomic_set(&epoch->epoch_size, 0);
	atomic_set(&epoch->active, 0);

	spin_lock(&mdev->epoch_lock);
	if (atomic_read(&mdev->current_epoch->epoch_size)) {
		list_add(&epoch->list, &mdev->current_epoch->list);
		mdev->current_epoch = epoch;
		mdev->epochs++;
	} else {
		/* The current_epoch got recycled while we allocated this one... */
		kfree(epoch);
	}
	spin_unlock(&mdev->epoch_lock);

	return true;
}

/* used from receive_RSDataReply (recv_resync_read)
 * and from receive_Data */
static struct drbd_epoch_entry *
read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
{
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
	struct drbd_epoch_entry *e;
	struct page *page;
	int dgs, ds, rr;
	void *dig_in = mdev->int_dig_in;
	void *dig_vv = mdev->int_dig_vv;
	unsigned long *data;

	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;

	if (dgs) {
		rr = drbd_recv(mdev, dig_in, dgs);
		if (rr != dgs) {
			if (!signal_pending(current))
				dev_warn(DEV,
					"short read receiving data digest: read %d expected %d\n",
					rr, dgs);
			return NULL;
		}
	}

	data_size -= dgs;

	ERR_IF(data_size & 0x1ff) return NULL;
	ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL;

	/* even though we trust our peer,
	 * we sometimes have to double check. */
	if (sector + (data_size>>9) > capacity) {
		dev_err(DEV, "request from peer beyond end of local disk: "
			"capacity: %llus < sector: %llus + size: %u\n",
			(unsigned long long)capacity,
			(unsigned long long)sector, data_size);
		return NULL;
	}

	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
	 * "criss-cross" setup, that might cause write-out on some other DRBD,
	 * which in turn might block on the other node at this very place. */
	e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
	if (!e)
		return NULL;

	if (!data_size)
		return e;

	ds = data_size;
	page = e->pages;
	page_chain_for_each(page) {
		unsigned len = min_t(int, ds, PAGE_SIZE);
		data = kmap(page);
		rr = drbd_recv(mdev, data, len);
		if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
			dev_err(DEV, "Fault injection: Corrupting data on receive\n");
			data[0] = data[0] ^ (unsigned long)-1;
		}
		kunmap(page);
		if (rr != len) {
			drbd_free_ee(mdev, e);
			if (!signal_pending(current))
				dev_warn(DEV, "short read receiving data: read %d expected %d\n",
					rr, len);
			return NULL;
		}
		ds -= rr;
	}

	if (dgs) {
		drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
		if (memcmp(dig_in, dig_vv, dgs)) {
			dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
				(unsigned long long)sector, data_size);
			drbd_bcast_ee(mdev, "digest failed",
					dgs, dig_in, dig_vv, e);
			drbd_free_ee(mdev, e);
			return NULL;
		}
	}
	mdev->recv_cnt += data_size>>9;
	return e;
}

/* drbd_drain_block() just takes a data block
 * out of the socket input buffer, and discards it.
 */
static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
{
	struct page *page;
	int rr, rv = 1;
	void *data;

	if (!data_size)
		return true;

	page = drbd_pp_alloc(mdev, 1, 1);

	data = kmap(page);
	while (data_size) {
		rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
		if (rr != min_t(int, data_size, PAGE_SIZE)) {
			rv = 0;
			if (!signal_pending(current))
				dev_warn(DEV,
					"short read receiving data: read %d expected %d\n",
					rr, min_t(int, data_size, PAGE_SIZE));
			break;
		}
		data_size -= rr;
	}
	kunmap(page);
	drbd_pp_free(mdev, page, 0);
	return rv;
}

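/* Copy a received data reply directly into the bio of the pending request,
 * verifying the data digest if one is configured. */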
static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
			   sector_t sector, int data_size)
{
	struct bio_vec *bvec;
	struct bio *bio;
	int dgs, rr, i, expect;
	void *dig_in = mdev->int_dig_in;
	void *dig_vv = mdev->int_dig_vv;

	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;

	if (dgs) {
		rr = drbd_recv(mdev, dig_in, dgs);
		if (rr != dgs) {
			if (!signal_pending(current))
				dev_warn(DEV,
					"short read receiving data reply digest: read %d expected %d\n",
					rr, dgs);
			return 0;
		}
	}

	data_size -= dgs;

	/* optimistically update recv_cnt.  if receiving fails below,
	 * we disconnect anyways, and counters will be reset. */
	mdev->recv_cnt += data_size>>9;

	bio = req->master_bio;
	D_ASSERT(sector == bio->bi_sector);

	bio_for_each_segment(bvec, bio, i) {
		expect = min_t(int, data_size, bvec->bv_len);
		rr = drbd_recv(mdev,
			     kmap(bvec->bv_page)+bvec->bv_offset,
			     expect);
		kunmap(bvec->bv_page);
		if (rr != expect) {
			if (!signal_pending(current))
				dev_warn(DEV, "short read receiving data reply: "
					"read %d expected %d\n",
					rr, expect);
			return 0;
		}
		data_size -= rr;
	}

	if (dgs) {
		drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
		if (memcmp(dig_in, dig_vv, dgs)) {
			dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
			return 0;
		}
	}

	D_ASSERT(data_size == 0);
	return 1;
}

/* e_end_resync_block() is called via
 * drbd_process_done_ee() by asender only */
static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
	sector_t sector = e->sector;
	int ok;

	D_ASSERT(hlist_unhashed(&e->collision));

	if (likely((e->flags & EE_WAS_ERROR) == 0)) {
		drbd_set_in_sync(mdev, sector, e->size);
		ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
	} else {
		/* Record failure to sync */
		drbd_rs_failed_io(mdev, sector, e->size);

		ok = drbd_send_ack(mdev, P_NEG_ACK, e);
	}
	dec_unacked(mdev);

	return ok;
}

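/* Read a resync data block from the socket into a new epoch entry and submit
 * it as a local write; the ACK is sent later from e_end_resync_block(). */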
static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
{
	struct drbd_epoch_entry *e;

	e = read_in_block(mdev, ID_SYNCER, sector, data_size);
	if (!e)
		goto fail;

	dec_rs_pending(mdev);

	inc_unacked(mdev);
	/* corresponding dec_unacked() in e_end_resync_block()
	 * respective _drbd_clear_done_ee */

	e->w.cb = e_end_resync_block;

	spin_lock_irq(&mdev->req_lock);
	list_add(&e->w.list, &mdev->sync_ee);
	spin_unlock_irq(&mdev->req_lock);

	atomic_add(data_size >> 9, &mdev->rs_sect_ev);
	if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
		return true;

	/* don't care for the reason here */
	dev_err(DEV, "submit failed, triggering re-connect\n");
	spin_lock_irq(&mdev->req_lock);
	list_del(&e->w.list);
	spin_unlock_irq(&mdev->req_lock);

	drbd_free_ee(mdev, e);
fail:
	put_ldev(mdev);
	return false;
}

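/* Handle a data reply for a pending application read: look up the request by
 * block_id/sector and copy the received data into it. */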
static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
	struct drbd_request *req;
	sector_t sector;
	int ok;
	struct p_data *p = &mdev->data.rbuf.data;

	sector = be64_to_cpu(p->sector);

	spin_lock_irq(&mdev->req_lock);
	req = _ar_id_to_req(mdev, p->block_id, sector);
	spin_unlock_irq(&mdev->req_lock);
	if (unlikely(!req)) {
		dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
		return false;
	}

	/* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
	 * special casing it there for the various failure cases.
	 * still no race with drbd_fail_pending_reads */
	ok = recv_dless_read(mdev, req, sector, data_size);

	if (ok)
		req_mod(req, data_received);
	/* else: nothing. handled from drbd_disconnect...
	 * I don't think we may complete this just yet
	 * in case we are "on-disconnect: freeze" */

	return ok;
}

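/* Handle a resync data reply: write the received block to the local disk,
 * or drain and NEG_ACK it if we currently have no local disk. */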
static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
	sector_t sector;
	int ok;
	struct p_data *p = &mdev->data.rbuf.data;

	sector = be64_to_cpu(p->sector);
	D_ASSERT(p->block_id == ID_SYNCER);

	if (get_ldev(mdev)) {
		/* data is submitted to disk within recv_resync_read.
		 * corresponding put_ldev done below on error,
		 * or in drbd_endio_write_sec. */
		ok = recv_resync_read(mdev, sector, data_size);
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Can not write resync data to local disk.\n");

		ok = drbd_drain_block(mdev, data_size);

		drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
	}

	atomic_add(data_size >> 9, &mdev->rs_sect_in);

	return ok;
}

1548/* e_end_block() is called via drbd_process_done_ee().
1549 * this means this function only runs in the asender thread
1550 */
1551static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1552{
1553 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1554 sector_t sector = e->sector;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001555 int ok = 1, pcmd;
1556
Philipp Reisnerb411b362009-09-25 16:07:19 -07001557 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001558 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001559 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1560 mdev->state.conn <= C_PAUSED_SYNC_T &&
1561 e->flags & EE_MAY_SET_IN_SYNC) ?
1562 P_RS_WRITE_ACK : P_WRITE_ACK;
1563 ok &= drbd_send_ack(mdev, pcmd, e);
1564 if (pcmd == P_RS_WRITE_ACK)
1565 drbd_set_in_sync(mdev, sector, e->size);
1566 } else {
1567 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1568 /* we expect it to be marked out of sync anyways...
1569 * maybe assert this? */
1570 }
1571 dec_unacked(mdev);
1572 }
1573 /* we delete from the conflict detection hash _after_ we sent out the
1574 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1575 if (mdev->net_conf->two_primaries) {
1576 spin_lock_irq(&mdev->req_lock);
Bart Van Assche24c48302011-05-21 18:32:29 +02001577 D_ASSERT(!hlist_unhashed(&e->collision));
1578 hlist_del_init(&e->collision);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001579 spin_unlock_irq(&mdev->req_lock);
1580 } else {
Bart Van Assche24c48302011-05-21 18:32:29 +02001581 D_ASSERT(hlist_unhashed(&e->collision));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001582 }
1583
1584 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1585
1586 return ok;
1587}
1588
1589static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1590{
1591 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1592 int ok = 1;
1593
1594 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1595 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1596
1597 spin_lock_irq(&mdev->req_lock);
Bart Van Assche24c48302011-05-21 18:32:29 +02001598 D_ASSERT(!hlist_unhashed(&e->collision));
1599 hlist_del_init(&e->collision);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001600 spin_unlock_irq(&mdev->req_lock);
1601
1602 dec_unacked(mdev);
1603
1604 return ok;
1605}
1606
Philipp Reisnerb6a370ba2012-02-19 01:27:53 +01001607static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
1608{
1609
1610 struct drbd_epoch_entry *rs_e;
1611	bool rv = false;
1612
1613 spin_lock_irq(&mdev->req_lock);
1614 list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
1615 if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
1616			rv = true;
1617 break;
1618 }
1619 }
1620 spin_unlock_irq(&mdev->req_lock);
1621
1622 return rv;
1623}
1624
Philipp Reisnerb411b362009-09-25 16:07:19 -07001625/* Called from receive_Data.
1626 * Synchronize packets on sock with packets on msock.
1627 *
1628 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1629 * packet traveling on msock, they are still processed in the order they have
1630 * been sent.
1631 *
1632 * Note: we don't care for Ack packets overtaking P_DATA packets.
1633 *
1634 * In case packet_seq is larger than mdev->peer_seq number, there are
1635 * outstanding packets on the msock. We wait for them to arrive.
1636 * In case we are the logically next packet, we update mdev->peer_seq
1637 * ourselves. Correctly handles 32bit wrap around.
1638 *
1639 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1640 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1641 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1642 * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1643 *
1644 * returns 0 if we may process the packet,
1645 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
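/*
 * Illustrative note (added, not part of the original driver comment):
 * assuming seq_le(a, b) compares 32bit sequence numbers via their signed
 * difference, roughly (s32)(a - b) <= 0, the wrap around is handled
 * naturally.  Example: with mdev->peer_seq == 0xfffffffe and an incoming
 * packet_seq == 0x00000002, packet_seq is still "three ahead" of
 * peer_seq + 1, so we keep waiting for the missing packets instead of
 * mistaking the wrapped value for an ancient one.
 */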
1646static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1647{
1648 DEFINE_WAIT(wait);
1649 unsigned int p_seq;
1650 long timeout;
1651 int ret = 0;
1652 spin_lock(&mdev->peer_seq_lock);
1653 for (;;) {
1654 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1655 if (seq_le(packet_seq, mdev->peer_seq+1))
1656 break;
1657 if (signal_pending(current)) {
1658 ret = -ERESTARTSYS;
1659 break;
1660 }
1661 p_seq = mdev->peer_seq;
1662 spin_unlock(&mdev->peer_seq_lock);
1663 timeout = schedule_timeout(30*HZ);
1664 spin_lock(&mdev->peer_seq_lock);
1665 if (timeout == 0 && p_seq == mdev->peer_seq) {
1666 ret = -ETIMEDOUT;
1667 dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1668 break;
1669 }
1670 }
1671 finish_wait(&mdev->seq_wait, &wait);
1672 if (mdev->peer_seq+1 == packet_seq)
1673 mdev->peer_seq++;
1674 spin_unlock(&mdev->peer_seq_lock);
1675 return ret;
1676}
1677
Lars Ellenberg688593c2010-11-17 22:25:03 +01001678/* see also bio_flags_to_wire()
1679 * DRBD_REQ_*, because we need to semantically map the flags to data packet
1680 * flags and back. We may replicate to other kernel versions. */
1681static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02001682{
Lars Ellenberg688593c2010-11-17 22:25:03 +01001683 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
1684 (dpf & DP_FUA ? REQ_FUA : 0) |
1685 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
1686 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02001687}
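/* Example (illustrative, derived from the mapping above): a peer write that
 * was sent with DP_FUA | DP_FLUSH set in p->dp_flags is resubmitted locally
 * with REQ_FUA | REQ_FLUSH, so the peer's ordering and durability semantics
 * are preserved across the wire. */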
1688
Philipp Reisnerb411b362009-09-25 16:07:19 -07001689/* mirrored write */
Philipp Reisner02918be2010-08-20 14:35:10 +02001690static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001691{
1692 sector_t sector;
1693 struct drbd_epoch_entry *e;
Philipp Reisner02918be2010-08-20 14:35:10 +02001694 struct p_data *p = &mdev->data.rbuf.data;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001695 int rw = WRITE;
1696 u32 dp_flags;
1697
Philipp Reisnerb411b362009-09-25 16:07:19 -07001698 if (!get_ldev(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001699 spin_lock(&mdev->peer_seq_lock);
1700 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1701 mdev->peer_seq++;
1702 spin_unlock(&mdev->peer_seq_lock);
1703
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02001704 drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001705 atomic_inc(&mdev->current_epoch->epoch_size);
1706 return drbd_drain_block(mdev, data_size);
1707 }
1708
1709 /* get_ldev(mdev) successful.
1710 * Corresponding put_ldev done either below (on various errors),
1711 * or in drbd_endio_write_sec, if we successfully submit the data at
1712 * the end of this function. */
1713
1714 sector = be64_to_cpu(p->sector);
1715 e = read_in_block(mdev, p->block_id, sector, data_size);
1716 if (!e) {
1717 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001718 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001719 }
1720
Philipp Reisnerb411b362009-09-25 16:07:19 -07001721 e->w.cb = e_end_block;
1722
Lars Ellenberg688593c2010-11-17 22:25:03 +01001723 dp_flags = be32_to_cpu(p->dp_flags);
1724 rw |= wire_flags_to_bio(mdev, dp_flags);
Lars Ellenberga73ff322012-06-25 19:15:38 +02001725 if (e->pages == NULL) {
1726 D_ASSERT(e->size == 0);
1727 D_ASSERT(dp_flags & DP_FLUSH);
1728 }
Lars Ellenberg688593c2010-11-17 22:25:03 +01001729
1730 if (dp_flags & DP_MAY_SET_IN_SYNC)
1731 e->flags |= EE_MAY_SET_IN_SYNC;
1732
Philipp Reisnerb411b362009-09-25 16:07:19 -07001733 spin_lock(&mdev->epoch_lock);
1734 e->epoch = mdev->current_epoch;
1735 atomic_inc(&e->epoch->epoch_size);
1736 atomic_inc(&e->epoch->active);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001737 spin_unlock(&mdev->epoch_lock);
1738
Philipp Reisnerb411b362009-09-25 16:07:19 -07001739 /* I'm the receiver, I do hold a net_cnt reference. */
1740 if (!mdev->net_conf->two_primaries) {
1741 spin_lock_irq(&mdev->req_lock);
1742 } else {
1743 /* don't get the req_lock yet,
1744 * we may sleep in drbd_wait_peer_seq */
1745 const int size = e->size;
1746 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1747 DEFINE_WAIT(wait);
1748 struct drbd_request *i;
1749 struct hlist_node *n;
1750 struct hlist_head *slot;
1751 int first;
1752
1753 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1754 BUG_ON(mdev->ee_hash == NULL);
1755 BUG_ON(mdev->tl_hash == NULL);
1756
1757 /* conflict detection and handling:
1758 * 1. wait on the sequence number,
1759 * in case this data packet overtook ACK packets.
1760 * 2. check our hash tables for conflicting requests.
1761 * we only need to walk the tl_hash, since an ee can not
1762		 * have a conflict with another ee: on the submitting
1763 * node, the corresponding req had already been conflicting,
1764 * and a conflicting req is never sent.
1765 *
1766 * Note: for two_primaries, we are protocol C,
1767 * so there cannot be any request that is DONE
1768 * but still on the transfer log.
1769 *
1770 * unconditionally add to the ee_hash.
1771 *
1772 * if no conflicting request is found:
1773 * submit.
1774 *
1775 * if any conflicting request is found
1776 * that has not yet been acked,
1777 * AND I have the "discard concurrent writes" flag:
1778 * queue (via done_ee) the P_DISCARD_ACK; OUT.
1779 *
1780 * if any conflicting request is found:
1781 * block the receiver, waiting on misc_wait
1782 * until no more conflicting requests are there,
1783 * or we get interrupted (disconnect).
1784 *
1785 * we do not just write after local io completion of those
1786 * requests, but only after req is done completely, i.e.
1787 * we wait for the P_DISCARD_ACK to arrive!
1788 *
1789 * then proceed normally, i.e. submit.
1790 */
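		/* Rough illustration of the strategy above (added note, simplified):
		 * if both primaries write the same block at the same time, the node
		 * that got the "discard concurrent writes" flag during the handshake
		 * answers the peer's conflicting P_DATA with P_DISCARD_ACK and never
		 * submits it, while the other node blocks here until its own request
		 * is completely done and only then writes the peer's version; both
		 * nodes should thus converge on the flag holder's data. */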
1791 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1792 goto out_interrupted;
1793
1794 spin_lock_irq(&mdev->req_lock);
1795
Bart Van Assche24c48302011-05-21 18:32:29 +02001796 hlist_add_head(&e->collision, ee_hash_slot(mdev, sector));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001797
1798#define OVERLAPS overlaps(i->sector, i->size, sector, size)
1799 slot = tl_hash_slot(mdev, sector);
1800 first = 1;
1801 for (;;) {
1802 int have_unacked = 0;
1803 int have_conflict = 0;
1804 prepare_to_wait(&mdev->misc_wait, &wait,
1805 TASK_INTERRUPTIBLE);
Bart Van Assche24c48302011-05-21 18:32:29 +02001806 hlist_for_each_entry(i, n, slot, collision) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001807 if (OVERLAPS) {
1808 /* only ALERT on first iteration,
1809 * we may be woken up early... */
1810 if (first)
1811 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1812 " new: %llus +%u; pending: %llus +%u\n",
1813 current->comm, current->pid,
1814 (unsigned long long)sector, size,
1815 (unsigned long long)i->sector, i->size);
1816 if (i->rq_state & RQ_NET_PENDING)
1817 ++have_unacked;
1818 ++have_conflict;
1819 }
1820 }
1821#undef OVERLAPS
1822 if (!have_conflict)
1823 break;
1824
1825 /* Discard Ack only for the _first_ iteration */
1826 if (first && discard && have_unacked) {
1827 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1828 (unsigned long long)sector);
1829 inc_unacked(mdev);
1830 e->w.cb = e_send_discard_ack;
1831 list_add_tail(&e->w.list, &mdev->done_ee);
1832
1833 spin_unlock_irq(&mdev->req_lock);
1834
1835 /* we could probably send that P_DISCARD_ACK ourselves,
1836 * but I don't like the receiver using the msock */
1837
1838 put_ldev(mdev);
1839 wake_asender(mdev);
1840 finish_wait(&mdev->misc_wait, &wait);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001841 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001842 }
1843
1844 if (signal_pending(current)) {
Bart Van Assche24c48302011-05-21 18:32:29 +02001845 hlist_del_init(&e->collision);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001846
1847 spin_unlock_irq(&mdev->req_lock);
1848
1849 finish_wait(&mdev->misc_wait, &wait);
1850 goto out_interrupted;
1851 }
1852
1853 spin_unlock_irq(&mdev->req_lock);
1854 if (first) {
1855 first = 0;
1856 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1857 "sec=%llus\n", (unsigned long long)sector);
1858 } else if (discard) {
1859 /* we had none on the first iteration.
1860 * there must be none now. */
1861 D_ASSERT(have_unacked == 0);
1862 }
1863 schedule();
1864 spin_lock_irq(&mdev->req_lock);
1865 }
1866 finish_wait(&mdev->misc_wait, &wait);
1867 }
1868
1869 list_add(&e->w.list, &mdev->active_ee);
1870 spin_unlock_irq(&mdev->req_lock);
1871
Philipp Reisnerb6a370ba2012-02-19 01:27:53 +01001872 if (mdev->state.conn == C_SYNC_TARGET)
1873 wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
1874
Philipp Reisnerb411b362009-09-25 16:07:19 -07001875 switch (mdev->net_conf->wire_protocol) {
1876 case DRBD_PROT_C:
1877 inc_unacked(mdev);
1878		/* corresponding dec_unacked() in e_end_block(),
1879		 * or in _drbd_clear_done_ee, respectively */
1880 break;
1881 case DRBD_PROT_B:
1882 /* I really don't like it that the receiver thread
1883 * sends on the msock, but anyways */
1884 drbd_send_ack(mdev, P_RECV_ACK, e);
1885 break;
1886 case DRBD_PROT_A:
1887 /* nothing to do */
1888 break;
1889 }
1890
Lars Ellenberg6719fb02010-10-18 23:04:07 +02001891 if (mdev->state.pdsk < D_INCONSISTENT) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001892 /* In case we have the only disk of the cluster, */
1893 drbd_set_out_of_sync(mdev, e->sector, e->size);
1894 e->flags |= EE_CALL_AL_COMPLETE_IO;
Lars Ellenberg6719fb02010-10-18 23:04:07 +02001895 e->flags &= ~EE_MAY_SET_IN_SYNC;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001896 drbd_al_begin_io(mdev, e->sector);
1897 }
1898
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001899 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001900 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001901
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001902 /* don't care for the reason here */
1903 dev_err(DEV, "submit failed, triggering re-connect\n");
Lars Ellenberg22cc37a2010-09-14 20:40:41 +02001904 spin_lock_irq(&mdev->req_lock);
1905 list_del(&e->w.list);
Bart Van Assche24c48302011-05-21 18:32:29 +02001906 hlist_del_init(&e->collision);
Lars Ellenberg22cc37a2010-09-14 20:40:41 +02001907 spin_unlock_irq(&mdev->req_lock);
1908 if (e->flags & EE_CALL_AL_COMPLETE_IO)
1909 drbd_al_complete_io(mdev, e->sector);
1910
Philipp Reisnerb411b362009-09-25 16:07:19 -07001911out_interrupted:
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001912 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001913 put_ldev(mdev);
1914 drbd_free_ee(mdev, e);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001915 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001916}
1917
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001918/* We may throttle resync, if the lower device seems to be busy,
1919 * and current sync rate is above c_min_rate.
1920 *
1921 * To decide whether or not the lower device is busy, we use a scheme similar
1922 * to MD RAID's is_mddev_idle(): if the partition stats reveal a "significant"
1923 * amount of activity (more than 64 sectors) that we cannot account for with
1924 * our own resync activity, the device obviously is "busy".
1925 *
1926 * The current sync rate used here uses only the most recent two step marks,
1927 * to have a short time average so we can react faster.
1928 */
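/* Worked example (added, illustrative only, assuming the usual 4KiB of
 * backing storage per bitmap bit, so Bit2KB(db/dt) yields KiB/s): if the
 * marks show 20480 bits cleared over the last 8 seconds, the recent sync
 * rate is 2560 bit/s == 10240 KiB/s; with c_min_rate configured below that,
 * and more than 64 sectors of unaccounted backing device activity, we
 * report "throttle". */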
Philipp Reisnere3555d82010-11-07 15:56:29 +01001929int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001930{
1931 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
1932 unsigned long db, dt, dbdt;
Philipp Reisnere3555d82010-11-07 15:56:29 +01001933 struct lc_element *tmp;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001934 int curr_events;
1935 int throttle = 0;
1936
1937 /* feature disabled? */
1938 if (mdev->sync_conf.c_min_rate == 0)
1939 return 0;
1940
Philipp Reisnere3555d82010-11-07 15:56:29 +01001941 spin_lock_irq(&mdev->al_lock);
1942 tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
1943 if (tmp) {
1944 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
1945 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
1946 spin_unlock_irq(&mdev->al_lock);
1947 return 0;
1948 }
1949 /* Do not slow down if app IO is already waiting for this extent */
1950 }
1951 spin_unlock_irq(&mdev->al_lock);
1952
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001953 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
1954 (int)part_stat_read(&disk->part0, sectors[1]) -
1955 atomic_read(&mdev->rs_sect_ev);
Philipp Reisnere3555d82010-11-07 15:56:29 +01001956
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001957 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
1958 unsigned long rs_left;
1959 int i;
1960
1961 mdev->rs_last_events = curr_events;
1962
1963 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
1964 * approx. */
Lars Ellenberg2649f082010-11-05 10:05:47 +01001965 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
1966
1967 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
1968 rs_left = mdev->ov_left;
1969 else
1970 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001971
1972 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
1973 if (!dt)
1974 dt++;
1975 db = mdev->rs_mark_left[i] - rs_left;
1976 dbdt = Bit2KB(db/dt);
1977
1978 if (dbdt > mdev->sync_conf.c_min_rate)
1979 throttle = 1;
1980 }
1981 return throttle;
1982}
1983
1984
Philipp Reisner02918be2010-08-20 14:35:10 +02001985static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001986{
1987 sector_t sector;
1988 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1989 struct drbd_epoch_entry *e;
1990 struct digest_info *di = NULL;
Philipp Reisnerb18b37b2010-10-13 15:32:44 +02001991 int size, verb;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001992 unsigned int fault_type;
Philipp Reisner02918be2010-08-20 14:35:10 +02001993 struct p_block_req *p = &mdev->data.rbuf.block_req;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001994
1995 sector = be64_to_cpu(p->sector);
1996 size = be32_to_cpu(p->blksize);
1997
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01001998 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001999 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2000 (unsigned long long)sector, size);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002001 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002002 }
2003 if (sector + (size>>9) > capacity) {
2004 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2005 (unsigned long long)sector, size);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002006 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002007 }
2008
2009 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
Philipp Reisnerb18b37b2010-10-13 15:32:44 +02002010 verb = 1;
2011 switch (cmd) {
2012 case P_DATA_REQUEST:
2013 drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
2014 break;
2015 case P_RS_DATA_REQUEST:
2016 case P_CSUM_RS_REQUEST:
2017 case P_OV_REQUEST:
2018			drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY, p);
2019 break;
2020 case P_OV_REPLY:
2021 verb = 0;
2022 dec_rs_pending(mdev);
2023 drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
2024 break;
2025 default:
2026 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
2027 cmdname(cmd));
2028 }
2029 if (verb && __ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002030 dev_err(DEV, "Can not satisfy peer's read request, "
2031 "no local data.\n");
Philipp Reisnerb18b37b2010-10-13 15:32:44 +02002032
Lars Ellenberga821cc42010-09-06 12:31:37 +02002033	/* drain possible payload */
2034 return drbd_drain_block(mdev, digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002035 }
2036
2037 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2038 * "criss-cross" setup, that might cause write-out on some other DRBD,
2039 * which in turn might block on the other node at this very place. */
2040 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
2041 if (!e) {
2042 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002043 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002044 }
2045
Philipp Reisner02918be2010-08-20 14:35:10 +02002046 switch (cmd) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002047 case P_DATA_REQUEST:
2048 e->w.cb = w_e_end_data_req;
2049 fault_type = DRBD_FAULT_DT_RD;
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002050 /* application IO, don't drbd_rs_begin_io */
2051 goto submit;
2052
Philipp Reisnerb411b362009-09-25 16:07:19 -07002053 case P_RS_DATA_REQUEST:
2054 e->w.cb = w_e_end_rsdata_req;
2055 fault_type = DRBD_FAULT_RS_RD;
Lars Ellenberg5f9915b2010-11-09 14:15:24 +01002056 /* used in the sector offset progress display */
2057 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002058 break;
2059
2060 case P_OV_REPLY:
2061 case P_CSUM_RS_REQUEST:
2062 fault_type = DRBD_FAULT_RS_RD;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002063 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
2064 if (!di)
2065 goto out_free_e;
2066
2067 di->digest_size = digest_size;
2068 di->digest = (((char *)di)+sizeof(struct digest_info));
2069
Lars Ellenbergc36c3ce2010-08-11 20:42:55 +02002070 e->digest = di;
2071 e->flags |= EE_HAS_DIGEST;
2072
Philipp Reisnerb411b362009-09-25 16:07:19 -07002073 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
2074 goto out_free_e;
2075
Philipp Reisner02918be2010-08-20 14:35:10 +02002076 if (cmd == P_CSUM_RS_REQUEST) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002077 D_ASSERT(mdev->agreed_pro_version >= 89);
2078 e->w.cb = w_e_end_csum_rs_req;
Lars Ellenberg5f9915b2010-11-09 14:15:24 +01002079 /* used in the sector offset progress display */
2080 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
Philipp Reisner02918be2010-08-20 14:35:10 +02002081 } else if (cmd == P_OV_REPLY) {
Lars Ellenberg2649f082010-11-05 10:05:47 +01002082 /* track progress, we may need to throttle */
2083 atomic_add(size >> 9, &mdev->rs_sect_in);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002084 e->w.cb = w_e_end_ov_reply;
2085 dec_rs_pending(mdev);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002086 /* drbd_rs_begin_io done when we sent this request,
2087 * but accounting still needs to be done. */
2088 goto submit_for_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002089 }
2090 break;
2091
2092 case P_OV_REQUEST:
Philipp Reisnerb411b362009-09-25 16:07:19 -07002093 if (mdev->ov_start_sector == ~(sector_t)0 &&
2094 mdev->agreed_pro_version >= 90) {
Lars Ellenbergde228bb2010-11-05 09:43:15 +01002095 unsigned long now = jiffies;
2096 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002097 mdev->ov_start_sector = sector;
2098 mdev->ov_position = sector;
Lars Ellenberg30b743a2010-11-05 09:39:06 +01002099 mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
2100 mdev->rs_total = mdev->ov_left;
Lars Ellenbergde228bb2010-11-05 09:43:15 +01002101 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2102 mdev->rs_mark_left[i] = mdev->ov_left;
2103 mdev->rs_mark_time[i] = now;
2104 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002105 dev_info(DEV, "Online Verify start sector: %llu\n",
2106 (unsigned long long)sector);
2107 }
2108 e->w.cb = w_e_end_ov_req;
2109 fault_type = DRBD_FAULT_RS_RD;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002110 break;
2111
Philipp Reisnerb411b362009-09-25 16:07:19 -07002112 default:
2113 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02002114 cmdname(cmd));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002115 fault_type = DRBD_FAULT_MAX;
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002116 goto out_free_e;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002117 }
2118
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002119 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2120 * wrt the receiver, but it is not as straightforward as it may seem.
2121 * Various places in the resync start and stop logic assume resync
2122 * requests are processed in order; requeuing this on the worker thread
2123 * would introduce a bunch of new code for synchronization between threads.
2124 *
2125 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2126 * "forever", throttling after drbd_rs_begin_io will lock that extent
2127 * for application writes for the same time. For now, just throttle
2128 * here, where the rest of the code expects the receiver to sleep for
2129 * a while, anyways.
2130 */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002131
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002132 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2133 * this defers syncer requests for some time, before letting at least
2134 * one request through. The resync controller on the receiving side
2135 * will adapt to the incoming rate accordingly.
2136 *
2137 * We cannot throttle here if remote is Primary/SyncTarget:
2138 * we would also throttle its application reads.
2139 * In that case, throttling is done on the SyncTarget only.
2140 */
Philipp Reisnere3555d82010-11-07 15:56:29 +01002141 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
2142 schedule_timeout_uninterruptible(HZ/10);
2143 if (drbd_rs_begin_io(mdev, sector))
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002144 goto out_free_e;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002145
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002146submit_for_resync:
2147 atomic_add(size >> 9, &mdev->rs_sect_ev);
2148
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002149submit:
Philipp Reisnerb411b362009-09-25 16:07:19 -07002150 inc_unacked(mdev);
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002151 spin_lock_irq(&mdev->req_lock);
2152 list_add_tail(&e->w.list, &mdev->read_ee);
2153 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002154
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002155 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002156 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002157
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01002158 /* don't care for the reason here */
2159 dev_err(DEV, "submit failed, triggering re-connect\n");
Lars Ellenberg22cc37a2010-09-14 20:40:41 +02002160 spin_lock_irq(&mdev->req_lock);
2161 list_del(&e->w.list);
2162 spin_unlock_irq(&mdev->req_lock);
2163 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2164
Philipp Reisnerb411b362009-09-25 16:07:19 -07002165out_free_e:
Philipp Reisnerb411b362009-09-25 16:07:19 -07002166 put_ldev(mdev);
2167 drbd_free_ee(mdev, e);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002168 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002169}
2170
2171static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2172{
2173 int self, peer, rv = -100;
2174 unsigned long ch_self, ch_peer;
2175
2176 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2177 peer = mdev->p_uuid[UI_BITMAP] & 1;
2178
2179 ch_peer = mdev->p_uuid[UI_SIZE];
2180 ch_self = mdev->comm_bm_set;
2181
2182 switch (mdev->net_conf->after_sb_0p) {
2183 case ASB_CONSENSUS:
2184 case ASB_DISCARD_SECONDARY:
2185 case ASB_CALL_HELPER:
2186 dev_err(DEV, "Configuration error.\n");
2187 break;
2188 case ASB_DISCONNECT:
2189 break;
2190 case ASB_DISCARD_YOUNGER_PRI:
2191 if (self == 0 && peer == 1) {
2192 rv = -1;
2193 break;
2194 }
2195 if (self == 1 && peer == 0) {
2196 rv = 1;
2197 break;
2198 }
2199 /* Else fall through to one of the other strategies... */
2200 case ASB_DISCARD_OLDER_PRI:
2201 if (self == 0 && peer == 1) {
2202 rv = 1;
2203 break;
2204 }
2205 if (self == 1 && peer == 0) {
2206 rv = -1;
2207 break;
2208 }
2209 /* Else fall through to one of the other strategies... */
Lars Ellenbergad19bf62009-10-14 09:36:49 +02002210 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
Philipp Reisnerb411b362009-09-25 16:07:19 -07002211 "Using discard-least-changes instead\n");
2212 case ASB_DISCARD_ZERO_CHG:
2213 if (ch_peer == 0 && ch_self == 0) {
2214 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2215 ? -1 : 1;
2216 break;
2217 } else {
2218 if (ch_peer == 0) { rv = 1; break; }
2219 if (ch_self == 0) { rv = -1; break; }
2220 }
2221 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2222 break;
2223 case ASB_DISCARD_LEAST_CHG:
2224 if (ch_self < ch_peer)
2225 rv = -1;
2226 else if (ch_self > ch_peer)
2227 rv = 1;
2228 else /* ( ch_self == ch_peer ) */
2229 /* Well, then use something else. */
2230 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2231 ? -1 : 1;
2232 break;
2233 case ASB_DISCARD_LOCAL:
2234 rv = -1;
2235 break;
2236 case ASB_DISCARD_REMOTE:
2237 rv = 1;
2238 }
2239
2240 return rv;
2241}
2242
2243static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2244{
Andreas Gruenbacher6184ea22010-12-09 14:23:27 +01002245 int hg, rv = -100;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002246
2247 switch (mdev->net_conf->after_sb_1p) {
2248 case ASB_DISCARD_YOUNGER_PRI:
2249 case ASB_DISCARD_OLDER_PRI:
2250 case ASB_DISCARD_LEAST_CHG:
2251 case ASB_DISCARD_LOCAL:
2252 case ASB_DISCARD_REMOTE:
2253 dev_err(DEV, "Configuration error.\n");
2254 break;
2255 case ASB_DISCONNECT:
2256 break;
2257 case ASB_CONSENSUS:
2258 hg = drbd_asb_recover_0p(mdev);
2259 if (hg == -1 && mdev->state.role == R_SECONDARY)
2260 rv = hg;
2261 if (hg == 1 && mdev->state.role == R_PRIMARY)
2262 rv = hg;
2263 break;
2264 case ASB_VIOLENTLY:
2265 rv = drbd_asb_recover_0p(mdev);
2266 break;
2267 case ASB_DISCARD_SECONDARY:
2268 return mdev->state.role == R_PRIMARY ? 1 : -1;
2269 case ASB_CALL_HELPER:
2270 hg = drbd_asb_recover_0p(mdev);
2271 if (hg == -1 && mdev->state.role == R_PRIMARY) {
Andreas Gruenbacherbb437942010-12-09 14:02:35 +01002272 enum drbd_state_rv rv2;
2273
2274 drbd_set_role(mdev, R_SECONDARY, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002275 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2276 * we might be here in C_WF_REPORT_PARAMS which is transient.
2277 * we do not need to wait for the after state change work either. */
Andreas Gruenbacherbb437942010-12-09 14:02:35 +01002278 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2279 if (rv2 != SS_SUCCESS) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002280 drbd_khelper(mdev, "pri-lost-after-sb");
2281 } else {
2282 dev_warn(DEV, "Successfully gave up primary role.\n");
2283 rv = hg;
2284 }
2285 } else
2286 rv = hg;
2287 }
2288
2289 return rv;
2290}
2291
2292static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2293{
Andreas Gruenbacher6184ea22010-12-09 14:23:27 +01002294 int hg, rv = -100;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002295
2296 switch (mdev->net_conf->after_sb_2p) {
2297 case ASB_DISCARD_YOUNGER_PRI:
2298 case ASB_DISCARD_OLDER_PRI:
2299 case ASB_DISCARD_LEAST_CHG:
2300 case ASB_DISCARD_LOCAL:
2301 case ASB_DISCARD_REMOTE:
2302 case ASB_CONSENSUS:
2303 case ASB_DISCARD_SECONDARY:
2304 dev_err(DEV, "Configuration error.\n");
2305 break;
2306 case ASB_VIOLENTLY:
2307 rv = drbd_asb_recover_0p(mdev);
2308 break;
2309 case ASB_DISCONNECT:
2310 break;
2311 case ASB_CALL_HELPER:
2312 hg = drbd_asb_recover_0p(mdev);
2313 if (hg == -1) {
Andreas Gruenbacherbb437942010-12-09 14:02:35 +01002314 enum drbd_state_rv rv2;
2315
Philipp Reisnerb411b362009-09-25 16:07:19 -07002316 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2317 * we might be here in C_WF_REPORT_PARAMS which is transient.
2318 * we do not need to wait for the after state change work either. */
Andreas Gruenbacherbb437942010-12-09 14:02:35 +01002319 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2320 if (rv2 != SS_SUCCESS) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002321 drbd_khelper(mdev, "pri-lost-after-sb");
2322 } else {
2323 dev_warn(DEV, "Successfully gave up primary role.\n");
2324 rv = hg;
2325 }
2326 } else
2327 rv = hg;
2328 }
2329
2330 return rv;
2331}
2332
2333static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2334 u64 bits, u64 flags)
2335{
2336 if (!uuid) {
2337 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2338 return;
2339 }
2340 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2341 text,
2342 (unsigned long long)uuid[UI_CURRENT],
2343 (unsigned long long)uuid[UI_BITMAP],
2344 (unsigned long long)uuid[UI_HISTORY_START],
2345 (unsigned long long)uuid[UI_HISTORY_END],
2346 (unsigned long long)bits,
2347 (unsigned long long)flags);
2348}
2349
2350/*
2351 100 after split brain try auto recover
2352 2 C_SYNC_SOURCE set BitMap
2353 1 C_SYNC_SOURCE use BitMap
2354 0 no Sync
2355 -1 C_SYNC_TARGET use BitMap
2356 -2 C_SYNC_TARGET set BitMap
2357 -100 after split brain, disconnect
2358-1000 unrelated data
Philipp Reisner4a23f262011-01-11 17:42:17 +01002359-1091 requires proto 91
2360-1096 requires proto 96
Philipp Reisnerb411b362009-09-25 16:07:19 -07002361 */
2362static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2363{
2364 u64 self, peer;
2365 int i, j;
2366
2367 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2368 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2369
2370 *rule_nr = 10;
2371 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2372 return 0;
2373
2374 *rule_nr = 20;
2375 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2376 peer != UUID_JUST_CREATED)
2377 return -2;
2378
2379 *rule_nr = 30;
2380 if (self != UUID_JUST_CREATED &&
2381 (peer == UUID_JUST_CREATED || peer == (u64)0))
2382 return 2;
2383
2384 if (self == peer) {
2385 int rct, dc; /* roles at crash time */
2386
2387 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2388
2389 if (mdev->agreed_pro_version < 91)
Philipp Reisner4a23f262011-01-11 17:42:17 +01002390 return -1091;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002391
2392 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2393 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2394 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
Philipp Reisner9f2247b2012-08-16 14:25:58 +02002395 drbd_uuid_move_history(mdev);
2396 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
2397 mdev->ldev->md.uuid[UI_BITMAP] = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002398
2399 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2400 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2401 *rule_nr = 34;
2402 } else {
2403 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2404 *rule_nr = 36;
2405 }
2406
2407 return 1;
2408 }
2409
2410 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2411
2412 if (mdev->agreed_pro_version < 91)
Philipp Reisner4a23f262011-01-11 17:42:17 +01002413 return -1091;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002414
2415 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2416 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2417 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2418
2419 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2420 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2421 mdev->p_uuid[UI_BITMAP] = 0UL;
2422
2423 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2424 *rule_nr = 35;
2425 } else {
2426 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2427 *rule_nr = 37;
2428 }
2429
2430 return -1;
2431 }
2432
2433 /* Common power [off|failure] */
2434 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2435 (mdev->p_uuid[UI_FLAGS] & 2);
2436 /* lowest bit is set when we were primary,
2437 * next bit (weight 2) is set when peer was primary */
2438 *rule_nr = 40;
2439
2440 switch (rct) {
2441 case 0: /* !self_pri && !peer_pri */ return 0;
2442 case 1: /* self_pri && !peer_pri */ return 1;
2443 case 2: /* !self_pri && peer_pri */ return -1;
2444 case 3: /* self_pri && peer_pri */
2445 dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2446 return dc ? -1 : 1;
2447 }
2448 }
2449
2450 *rule_nr = 50;
2451 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2452 if (self == peer)
2453 return -1;
2454
2455 *rule_nr = 51;
2456 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2457 if (self == peer) {
Philipp Reisner4a23f262011-01-11 17:42:17 +01002458 if (mdev->agreed_pro_version < 96 ?
2459 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2460 (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2461 peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002462			/* The last P_SYNC_UUID did not get through. Undo the modifications
2463			   the peer made to its UUIDs when it last started a resync as sync source. */
2464
2465 if (mdev->agreed_pro_version < 91)
Philipp Reisner4a23f262011-01-11 17:42:17 +01002466 return -1091;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002467
2468 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2469 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
Philipp Reisner4a23f262011-01-11 17:42:17 +01002470
Lars Ellenberg92b4ca22012-04-30 12:53:52 +02002471 dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
Philipp Reisner4a23f262011-01-11 17:42:17 +01002472 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2473
Philipp Reisnerb411b362009-09-25 16:07:19 -07002474 return -1;
2475 }
2476 }
2477
2478 *rule_nr = 60;
2479 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2480 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2481 peer = mdev->p_uuid[i] & ~((u64)1);
2482 if (self == peer)
2483 return -2;
2484 }
2485
2486 *rule_nr = 70;
2487 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2488 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2489 if (self == peer)
2490 return 1;
2491
2492 *rule_nr = 71;
2493 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2494 if (self == peer) {
Philipp Reisner4a23f262011-01-11 17:42:17 +01002495 if (mdev->agreed_pro_version < 96 ?
2496 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2497 (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2498 self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002499			/* The last P_SYNC_UUID did not get through. Undo the modifications
2500			   we made to our UUIDs when we last started a resync as sync source. */
2501
2502 if (mdev->agreed_pro_version < 91)
Philipp Reisner4a23f262011-01-11 17:42:17 +01002503 return -1091;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002504
Philipp Reisner9f2247b2012-08-16 14:25:58 +02002505 __drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2506 __drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002507
Philipp Reisner4a23f262011-01-11 17:42:17 +01002508 dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
Philipp Reisnerb411b362009-09-25 16:07:19 -07002509 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2510 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2511
2512 return 1;
2513 }
2514 }
2515
2516
2517 *rule_nr = 80;
Philipp Reisnerd8c2a362009-11-18 15:52:51 +01002518 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002519 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2520 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2521 if (self == peer)
2522 return 2;
2523 }
2524
2525 *rule_nr = 90;
2526 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2527 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2528 if (self == peer && self != ((u64)0))
2529 return 100;
2530
2531 *rule_nr = 100;
2532 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2533 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2534 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2535 peer = mdev->p_uuid[j] & ~((u64)1);
2536 if (self == peer)
2537 return -100;
2538 }
2539 }
2540
2541 return -1000;
2542}
2543
2544/* drbd_sync_handshake() returns the new conn state on success, or
2545 CONN_MASK (-1) on failure.
2546 */
2547static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2548 enum drbd_disk_state peer_disk) __must_hold(local)
2549{
2550 int hg, rule_nr;
2551 enum drbd_conns rv = C_MASK;
2552 enum drbd_disk_state mydisk;
2553
2554 mydisk = mdev->state.disk;
2555 if (mydisk == D_NEGOTIATING)
2556 mydisk = mdev->new_state_tmp.disk;
2557
2558 dev_info(DEV, "drbd_sync_handshake:\n");
Philipp Reisner9f2247b2012-08-16 14:25:58 +02002559
2560 spin_lock_irq(&mdev->ldev->md.uuid_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002561 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2562 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2563 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2564
2565 hg = drbd_uuid_compare(mdev, &rule_nr);
Philipp Reisner9f2247b2012-08-16 14:25:58 +02002566 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002567
2568 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2569
2570 if (hg == -1000) {
2571 dev_alert(DEV, "Unrelated data, aborting!\n");
2572 return C_MASK;
2573 }
Philipp Reisner4a23f262011-01-11 17:42:17 +01002574 if (hg < -1000) {
2575 dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002576 return C_MASK;
2577 }
2578
2579 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2580 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2581 int f = (hg == -100) || abs(hg) == 2;
2582 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2583 if (f)
2584 hg = hg*2;
2585 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2586 hg > 0 ? "source" : "target");
2587 }
2588
Adam Gandelman3a11a482010-04-08 16:48:23 -07002589 if (abs(hg) == 100)
2590 drbd_khelper(mdev, "initial-split-brain");
2591
Philipp Reisnerb411b362009-09-25 16:07:19 -07002592 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2593 int pcount = (mdev->state.role == R_PRIMARY)
2594 + (peer_role == R_PRIMARY);
2595 int forced = (hg == -100);
2596
2597 switch (pcount) {
2598 case 0:
2599 hg = drbd_asb_recover_0p(mdev);
2600 break;
2601 case 1:
2602 hg = drbd_asb_recover_1p(mdev);
2603 break;
2604 case 2:
2605 hg = drbd_asb_recover_2p(mdev);
2606 break;
2607 }
2608 if (abs(hg) < 100) {
2609 dev_warn(DEV, "Split-Brain detected, %d primaries, "
2610 "automatically solved. Sync from %s node\n",
2611 pcount, (hg < 0) ? "peer" : "this");
2612 if (forced) {
2613 dev_warn(DEV, "Doing a full sync, since"
2614					     " UUIDs were ambiguous.\n");
2615 hg = hg*2;
2616 }
2617 }
2618 }
2619
2620 if (hg == -100) {
2621 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2622 hg = -1;
2623 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2624 hg = 1;
2625
2626 if (abs(hg) < 100)
2627 dev_warn(DEV, "Split-Brain detected, manually solved. "
2628 "Sync from %s node\n",
2629 (hg < 0) ? "peer" : "this");
2630 }
2631
2632 if (hg == -100) {
Lars Ellenberg580b9762010-02-26 23:15:23 +01002633 /* FIXME this log message is not correct if we end up here
2634 * after an attempted attach on a diskless node.
2635 * We just refuse to attach -- well, we drop the "connection"
2636 * to that disk, in a way... */
Adam Gandelman3a11a482010-04-08 16:48:23 -07002637 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
Philipp Reisnerb411b362009-09-25 16:07:19 -07002638 drbd_khelper(mdev, "split-brain");
2639 return C_MASK;
2640 }
2641
2642 if (hg > 0 && mydisk <= D_INCONSISTENT) {
2643 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2644 return C_MASK;
2645 }
2646
2647 if (hg < 0 && /* by intention we do not use mydisk here. */
2648 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2649 switch (mdev->net_conf->rr_conflict) {
2650 case ASB_CALL_HELPER:
2651 drbd_khelper(mdev, "pri-lost");
2652 /* fall through */
2653 case ASB_DISCONNECT:
2654 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2655 return C_MASK;
2656 case ASB_VIOLENTLY:
2657			dev_warn(DEV, "Becoming SyncTarget, violating the stable-data "
2658				"assumption\n");
2659 }
2660 }
2661
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002662 if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) {
2663 if (hg == 0)
2664 dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
2665 else
2666			dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.\n",
2667 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
2668 abs(hg) >= 2 ? "full" : "bit-map based");
2669 return C_MASK;
2670 }
2671
Philipp Reisnerb411b362009-09-25 16:07:19 -07002672 if (abs(hg) >= 2) {
2673 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01002674 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
2675 BM_LOCKED_SET_ALLOWED))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002676 return C_MASK;
2677 }
2678
2679 if (hg > 0) { /* become sync source. */
2680 rv = C_WF_BITMAP_S;
2681 } else if (hg < 0) { /* become sync target */
2682 rv = C_WF_BITMAP_T;
2683 } else {
2684 rv = C_CONNECTED;
2685 if (drbd_bm_total_weight(mdev)) {
2686 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2687 drbd_bm_total_weight(mdev));
2688 }
2689 }
2690
2691 return rv;
2692}
2693
2694/* returns 1 if invalid */
2695static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2696{
2697 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2698 if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2699 (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2700 return 0;
2701
2702 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2703 if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2704 self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2705 return 1;
2706
2707 /* everything else is valid if they are equal on both sides. */
2708 if (peer == self)
2709 return 0;
2710
2711	/* everything else is invalid. */
2712 return 1;
2713}
2714
Philipp Reisner02918be2010-08-20 14:35:10 +02002715static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002716{
Philipp Reisner02918be2010-08-20 14:35:10 +02002717 struct p_protocol *p = &mdev->data.rbuf.protocol;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002718 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002719 int p_want_lose, p_two_primaries, cf;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002720 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2721
Philipp Reisnerb411b362009-09-25 16:07:19 -07002722 p_proto = be32_to_cpu(p->protocol);
2723 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2724 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
2725 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002726 p_two_primaries = be32_to_cpu(p->two_primaries);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002727 cf = be32_to_cpu(p->conn_flags);
2728 p_want_lose = cf & CF_WANT_LOSE;
2729
2730 clear_bit(CONN_DRY_RUN, &mdev->flags);
2731
2732 if (cf & CF_DRY_RUN)
2733 set_bit(CONN_DRY_RUN, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002734
2735 if (p_proto != mdev->net_conf->wire_protocol) {
2736 dev_err(DEV, "incompatible communication protocols\n");
2737 goto disconnect;
2738 }
2739
2740 if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2741 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2742 goto disconnect;
2743 }
2744
2745 if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2746 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2747 goto disconnect;
2748 }
2749
2750 if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2751 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2752 goto disconnect;
2753 }
2754
2755 if (p_want_lose && mdev->net_conf->want_lose) {
2756 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2757 goto disconnect;
2758 }
2759
2760 if (p_two_primaries != mdev->net_conf->two_primaries) {
2761 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2762 goto disconnect;
2763 }
2764
2765 if (mdev->agreed_pro_version >= 87) {
2766 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2767
2768 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002769 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002770
2771 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2772 if (strcmp(p_integrity_alg, my_alg)) {
2773 dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2774 goto disconnect;
2775 }
2776 dev_info(DEV, "data-integrity-alg: %s\n",
2777 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2778 }
2779
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002780 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002781
2782disconnect:
2783 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002784 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002785}
2786
2787/* helper function
2788 * input: alg name, feature name
2789 * return: NULL (alg name was "")
2790 * ERR_PTR(error) if something goes wrong
2791 * or the crypto hash ptr, if it worked out ok. */
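/* Usage note (added): the callers in receive_SyncParam() below treat the
 * three outcomes separately -- NULL simply means the alg is not configured,
 * IS_ERR() makes them disconnect, and a valid tfm is only installed later
 * while holding peer_seq_lock. */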
2792struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2793 const char *alg, const char *name)
2794{
2795 struct crypto_hash *tfm;
2796
2797 if (!alg[0])
2798 return NULL;
2799
2800 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2801 if (IS_ERR(tfm)) {
2802 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2803 alg, name, PTR_ERR(tfm));
2804 return tfm;
2805 }
2806 if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2807 crypto_free_hash(tfm);
2808 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2809 return ERR_PTR(-EINVAL);
2810 }
2811 return tfm;
2812}
2813
Philipp Reisner02918be2010-08-20 14:35:10 +02002814static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002815{
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002816 int ok = true;
Philipp Reisner02918be2010-08-20 14:35:10 +02002817 struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002818 unsigned int header_size, data_size, exp_max_sz;
2819 struct crypto_hash *verify_tfm = NULL;
2820 struct crypto_hash *csums_tfm = NULL;
2821 const int apv = mdev->agreed_pro_version;
Philipp Reisner778f2712010-07-06 11:14:00 +02002822 int *rs_plan_s = NULL;
2823 int fifo_size = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002824
2825 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
2826 : apv == 88 ? sizeof(struct p_rs_param)
2827 + SHARED_SECRET_MAX
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002828 : apv <= 94 ? sizeof(struct p_rs_param_89)
2829 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002830
Philipp Reisner02918be2010-08-20 14:35:10 +02002831 if (packet_size > exp_max_sz) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002832 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02002833 packet_size, exp_max_sz);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002834 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002835 }
2836
2837 if (apv <= 88) {
Philipp Reisner02918be2010-08-20 14:35:10 +02002838 header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
2839 data_size = packet_size - header_size;
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002840 } else if (apv <= 94) {
Philipp Reisner02918be2010-08-20 14:35:10 +02002841 header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
2842 data_size = packet_size - header_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002843 D_ASSERT(data_size == 0);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002844 } else {
Philipp Reisner02918be2010-08-20 14:35:10 +02002845 header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
2846 data_size = packet_size - header_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002847 D_ASSERT(data_size == 0);
2848 }
2849
2850 /* initialize verify_alg and csums_alg */
2851 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2852
Philipp Reisner02918be2010-08-20 14:35:10 +02002853 if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002854 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002855
2856 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2857
2858 if (apv >= 88) {
2859 if (apv == 88) {
Philipp Reisner5de73822012-03-28 10:17:32 +02002860 if (data_size > SHARED_SECRET_MAX || data_size == 0) {
2861 dev_err(DEV, "verify-alg of wrong size, "
2862 "peer wants %u, accepting only up to %u byte\n",
2863 data_size, SHARED_SECRET_MAX);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002864 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002865 }
2866
2867 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002868 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002869
2870 /* we expect NUL terminated string */
2871 /* but just in case someone tries to be evil */
2872 D_ASSERT(p->verify_alg[data_size-1] == 0);
2873 p->verify_alg[data_size-1] = 0;
2874
2875 } else /* apv >= 89 */ {
2876 /* we still expect NUL terminated strings */
2877 /* but just in case someone tries to be evil */
2878 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2879 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2880 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2881 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2882 }
2883
2884 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2885 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2886 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2887 mdev->sync_conf.verify_alg, p->verify_alg);
2888 goto disconnect;
2889 }
2890 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2891 p->verify_alg, "verify-alg");
2892 if (IS_ERR(verify_tfm)) {
2893 verify_tfm = NULL;
2894 goto disconnect;
2895 }
2896 }
2897
2898 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2899 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2900 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2901 mdev->sync_conf.csums_alg, p->csums_alg);
2902 goto disconnect;
2903 }
2904 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2905 p->csums_alg, "csums-alg");
2906 if (IS_ERR(csums_tfm)) {
2907 csums_tfm = NULL;
2908 goto disconnect;
2909 }
2910 }
2911
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002912 if (apv > 94) {
2913 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2914 mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
2915 mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
2916 mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
2917 mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
Philipp Reisner778f2712010-07-06 11:14:00 +02002918
2919 fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
2920 if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
2921 rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
2922 if (!rs_plan_s) {
2923 				dev_err(DEV, "kzalloc of fifo_buffer failed\n");
2924 goto disconnect;
2925 }
2926 }
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002927 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002928
2929 spin_lock(&mdev->peer_seq_lock);
2930 /* lock against drbd_nl_syncer_conf() */
2931 if (verify_tfm) {
2932 strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2933 mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2934 crypto_free_hash(mdev->verify_tfm);
2935 mdev->verify_tfm = verify_tfm;
2936 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2937 }
2938 if (csums_tfm) {
2939 strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2940 mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2941 crypto_free_hash(mdev->csums_tfm);
2942 mdev->csums_tfm = csums_tfm;
2943 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2944 }
Philipp Reisner778f2712010-07-06 11:14:00 +02002945 if (fifo_size != mdev->rs_plan_s.size) {
2946 kfree(mdev->rs_plan_s.values);
2947 mdev->rs_plan_s.values = rs_plan_s;
2948 mdev->rs_plan_s.size = fifo_size;
2949 mdev->rs_planed = 0;
2950 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002951 spin_unlock(&mdev->peer_seq_lock);
2952 }
2953
2954 return ok;
2955disconnect:
2956 /* just for completeness: actually not needed,
2957 * as this is not reached if csums_tfm was ok. */
2958 crypto_free_hash(csums_tfm);
2959 /* but free the verify_tfm again, if csums_tfm did not work out */
2960 crypto_free_hash(verify_tfm);
2961 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002962 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002963}
2964
Philipp Reisnerb411b362009-09-25 16:07:19 -07002965/* warn if the arguments differ by more than 12.5% */
2966static void warn_if_differ_considerably(struct drbd_conf *mdev,
2967 const char *s, sector_t a, sector_t b)
2968{
2969 sector_t d;
2970 if (a == 0 || b == 0)
2971 return;
2972 d = (a > b) ? (a - b) : (b - a);
2973 if (d > (a>>3) || d > (b>>3))
2974 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
2975 (unsigned long long)a, (unsigned long long)b);
2976}
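
/*
 * Editorial illustration, not part of the driver: d > (a>>3) is just
 * d > a/8, i.e. a relative difference of more than 12.5%.  A minimal
 * standalone sketch of the same check, kept disabled so it does not
 * affect the build:
 */
#if 0
static bool differ_considerably(sector_t a, sector_t b)
{
	sector_t d;

	if (a == 0 || b == 0)
		return false;
	d = (a > b) ? (a - b) : (b - a);
	/* e.g. a = 1000, b = 1200: d = 200 > (1000 >> 3) = 125, so warn */
	return d > (a >> 3) || d > (b >> 3);
}
#endif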
2977
Philipp Reisner02918be2010-08-20 14:35:10 +02002978static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002979{
Philipp Reisner02918be2010-08-20 14:35:10 +02002980 struct p_sizes *p = &mdev->data.rbuf.sizes;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002981 enum determine_dev_size dd = unchanged;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002982 sector_t p_size, p_usize, my_usize;
2983 int ldsc = 0; /* local disk size changed */
Philipp Reisnere89b5912010-03-24 17:11:33 +01002984 enum dds_flags ddsf;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002985
Philipp Reisnerb411b362009-09-25 16:07:19 -07002986 p_size = be64_to_cpu(p->d_size);
2987 p_usize = be64_to_cpu(p->u_size);
2988
2989 if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2990 dev_err(DEV, "some backing storage is needed\n");
2991 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002992 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002993 }
2994
2995 /* just store the peer's disk size for now.
2996 * we still need to figure out whether we accept that. */
2997 mdev->p_size = p_size;
2998
Philipp Reisnerb411b362009-09-25 16:07:19 -07002999 if (get_ldev(mdev)) {
3000 warn_if_differ_considerably(mdev, "lower level device sizes",
3001 p_size, drbd_get_max_capacity(mdev->ldev));
3002 warn_if_differ_considerably(mdev, "user requested size",
3003 p_usize, mdev->ldev->dc.disk_size);
3004
3005 /* if this is the first connect, or an otherwise expected
3006 * param exchange, choose the minimum */
3007 if (mdev->state.conn == C_WF_REPORT_PARAMS)
3008 p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
3009 p_usize);
3010
3011 my_usize = mdev->ldev->dc.disk_size;
3012
3013 if (mdev->ldev->dc.disk_size != p_usize) {
3014 mdev->ldev->dc.disk_size = p_usize;
3015 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
3016 (unsigned long)mdev->ldev->dc.disk_size);
3017 }
3018
3019 /* Never shrink a device with usable data during connect.
3020 But allow online shrinking if we are connected. */
Philipp Reisnera393db62009-12-22 13:35:52 +01003021 if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
Philipp Reisnerb411b362009-09-25 16:07:19 -07003022 drbd_get_capacity(mdev->this_bdev) &&
3023 mdev->state.disk >= D_OUTDATED &&
3024 mdev->state.conn < C_CONNECTED) {
3025 dev_err(DEV, "The peer's disk size is too small!\n");
3026 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
3027 mdev->ldev->dc.disk_size = my_usize;
3028 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003029 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003030 }
3031 put_ldev(mdev);
3032 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003033
Philipp Reisnere89b5912010-03-24 17:11:33 +01003034 ddsf = be16_to_cpu(p->dds_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003035 if (get_ldev(mdev)) {
Bart Van Assche24c48302011-05-21 18:32:29 +02003036 dd = drbd_determine_dev_size(mdev, ddsf);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003037 put_ldev(mdev);
3038 if (dd == dev_size_error)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003039 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003040 drbd_md_sync(mdev);
3041 } else {
3042 /* I am diskless, need to accept the peer's size. */
3043 drbd_set_my_capacity(mdev, p_size);
3044 }
3045
Philipp Reisner99432fc2011-05-20 16:39:13 +02003046 mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
3047 drbd_reconsider_max_bio_size(mdev);
3048
Philipp Reisnerb411b362009-09-25 16:07:19 -07003049 if (get_ldev(mdev)) {
3050 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3051 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3052 ldsc = 1;
3053 }
3054
Philipp Reisnerb411b362009-09-25 16:07:19 -07003055 put_ldev(mdev);
3056 }
3057
3058 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3059 if (be64_to_cpu(p->c_size) !=
3060 drbd_get_capacity(mdev->this_bdev) || ldsc) {
3061 /* we have different sizes, probably peer
3062 * needs to know my new size... */
Philipp Reisnere89b5912010-03-24 17:11:33 +01003063 drbd_send_sizes(mdev, 0, ddsf);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003064 }
3065 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3066 (dd == grew && mdev->state.conn == C_CONNECTED)) {
3067 if (mdev->state.pdsk >= D_INCONSISTENT &&
Philipp Reisnere89b5912010-03-24 17:11:33 +01003068 mdev->state.disk >= D_INCONSISTENT) {
3069 if (ddsf & DDSF_NO_RESYNC)
3070 dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3071 else
3072 resync_after_online_grow(mdev);
3073 } else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003074 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3075 }
3076 }
3077
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003078 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003079}
3080
Philipp Reisner02918be2010-08-20 14:35:10 +02003081static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003082{
Philipp Reisner02918be2010-08-20 14:35:10 +02003083 struct p_uuids *p = &mdev->data.rbuf.uuids;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003084 u64 *p_uuid;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003085 int i, updated_uuids = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003086
Philipp Reisnerb411b362009-09-25 16:07:19 -07003087 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3088
3089 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3090 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3091
3092 kfree(mdev->p_uuid);
3093 mdev->p_uuid = p_uuid;
3094
3095 if (mdev->state.conn < C_CONNECTED &&
3096 mdev->state.disk < D_INCONSISTENT &&
3097 mdev->state.role == R_PRIMARY &&
3098 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3099 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3100 (unsigned long long)mdev->ed_uuid);
3101 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003102 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003103 }
3104
3105 if (get_ldev(mdev)) {
3106 int skip_initial_sync =
3107 mdev->state.conn == C_CONNECTED &&
3108 mdev->agreed_pro_version >= 90 &&
3109 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3110 (p_uuid[UI_FLAGS] & 8);
3111 if (skip_initial_sync) {
3112 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3113 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003114 "clear_n_write from receive_uuids",
3115 BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003116 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3117 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3118 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3119 CS_VERBOSE, NULL);
3120 drbd_md_sync(mdev);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003121 updated_uuids = 1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003122 }
3123 put_ldev(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02003124 } else if (mdev->state.disk < D_INCONSISTENT &&
3125 mdev->state.role == R_PRIMARY) {
3126 /* I am a diskless primary, the peer just created a new current UUID
3127 for me. */
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003128 updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003129 }
3130
3131	/* Before we test for the disk state, we should wait until a possibly
3132	   ongoing cluster-wide state change has finished. That is important if
3133 we are primary and are detaching from our disk. We need to see the
3134 new disk state... */
3135 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3136 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003137 updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3138
3139 if (updated_uuids)
3140 drbd_print_uuids(mdev, "receiver updated UUIDs to");
Philipp Reisnerb411b362009-09-25 16:07:19 -07003141
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003142 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003143}
3144
3145/**
3146 * convert_state() - Converts the peer's view of the cluster state to our point of view
3147 * @ps: The state as seen by the peer.
3148 */
3149static union drbd_state convert_state(union drbd_state ps)
3150{
3151 union drbd_state ms;
3152
3153 static enum drbd_conns c_tab[] = {
3154 [C_CONNECTED] = C_CONNECTED,
3155
3156 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3157 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3158 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3159 [C_VERIFY_S] = C_VERIFY_T,
3160 [C_MASK] = C_MASK,
3161 };
3162
3163 ms.i = ps.i;
3164
3165 ms.conn = c_tab[ps.conn];
3166 ms.peer = ps.role;
3167 ms.role = ps.peer;
3168 ms.pdsk = ps.disk;
3169 ms.disk = ps.pdsk;
3170 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3171
3172 return ms;
3173}
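
/*
 * Worked example (illustrative values only): if the peer reports
 * { role = R_PRIMARY, peer = R_SECONDARY, disk = D_UP_TO_DATE,
 *   pdsk = D_INCONSISTENT, conn = C_STARTING_SYNC_S },
 * convert_state() yields
 * { role = R_SECONDARY, peer = R_PRIMARY, disk = D_INCONSISTENT,
 *   pdsk = D_UP_TO_DATE, conn = C_STARTING_SYNC_T },
 * i.e. every field is mirrored into our point of view before the
 * request is applied locally.
 */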
3174
Philipp Reisner02918be2010-08-20 14:35:10 +02003175static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003176{
Philipp Reisner02918be2010-08-20 14:35:10 +02003177 struct p_req_state *p = &mdev->data.rbuf.req_state;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003178 union drbd_state mask, val;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01003179 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003180
Philipp Reisnerb411b362009-09-25 16:07:19 -07003181 mask.i = be32_to_cpu(p->mask);
3182 val.i = be32_to_cpu(p->val);
3183
3184 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3185 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3186 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003187 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003188 }
3189
3190 mask = convert_state(mask);
3191 val = convert_state(val);
3192
3193 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3194
3195 drbd_send_sr_reply(mdev, rv);
3196 drbd_md_sync(mdev);
3197
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003198 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003199}
3200
Philipp Reisner02918be2010-08-20 14:35:10 +02003201static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003202{
Philipp Reisner02918be2010-08-20 14:35:10 +02003203 struct p_state *p = &mdev->data.rbuf.state;
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003204 union drbd_state os, ns, peer_state;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003205 enum drbd_disk_state real_peer_disk;
Philipp Reisner65d922c2010-06-16 16:18:09 +02003206 enum chg_state_flags cs_flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003207 int rv;
3208
Philipp Reisnerb411b362009-09-25 16:07:19 -07003209 peer_state.i = be32_to_cpu(p->state);
3210
3211 real_peer_disk = peer_state.disk;
3212 if (peer_state.disk == D_NEGOTIATING) {
3213 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3214 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3215 }
3216
3217 spin_lock_irq(&mdev->req_lock);
3218 retry:
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003219 os = ns = mdev->state;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003220 spin_unlock_irq(&mdev->req_lock);
3221
Lars Ellenberg545752d2011-12-05 14:39:25 +01003222 /* If some other part of the code (asender thread, timeout)
3223 * already decided to close the connection again,
3224 * we must not "re-establish" it here. */
3225 if (os.conn <= C_TEAR_DOWN)
3226 return false;
3227
Lars Ellenberg40424e42011-09-26 15:24:56 +02003228 /* If this is the "end of sync" confirmation, usually the peer disk
3229 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
3230 * set) resync started in PausedSyncT, or if the timing of pause-/
3231 * unpause-sync events has been "just right", the peer disk may
3232 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
3233 */
3234 if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
3235 real_peer_disk == D_UP_TO_DATE &&
Lars Ellenberge9ef7bb2010-10-07 15:55:39 +02003236 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3237 /* If we are (becoming) SyncSource, but peer is still in sync
3238 * preparation, ignore its uptodate-ness to avoid flapping, it
3239 * will change to inconsistent once the peer reaches active
3240 * syncing states.
3241 * It may have changed syncer-paused flags, however, so we
3242 * cannot ignore this completely. */
3243 if (peer_state.conn > C_CONNECTED &&
3244 peer_state.conn < C_SYNC_SOURCE)
3245 real_peer_disk = D_INCONSISTENT;
3246
3247 /* if peer_state changes to connected at the same time,
3248 * it explicitly notifies us that it finished resync.
3249 * Maybe we should finish it up, too? */
3250 else if (os.conn >= C_SYNC_SOURCE &&
3251 peer_state.conn == C_CONNECTED) {
3252 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3253 drbd_resync_finished(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003254 return true;
Lars Ellenberge9ef7bb2010-10-07 15:55:39 +02003255 }
3256 }
3257
3258	/* peer says its disk is inconsistent, while we think it is uptodate,
3259 * and this happens while the peer still thinks we have a sync going on,
3260 * but we think we are already done with the sync.
3261 * We ignore this to avoid flapping pdsk.
3262	 * This should not happen if the peer is a recent version of drbd. */
3263 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3264 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3265 real_peer_disk = D_UP_TO_DATE;
3266
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003267 if (ns.conn == C_WF_REPORT_PARAMS)
3268 ns.conn = C_CONNECTED;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003269
Philipp Reisner67531712010-10-27 12:21:30 +02003270 if (peer_state.conn == C_AHEAD)
3271 ns.conn = C_BEHIND;
3272
Philipp Reisnerb411b362009-09-25 16:07:19 -07003273 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3274 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3275 int cr; /* consider resync */
3276
3277 /* if we established a new connection */
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003278 cr = (os.conn < C_CONNECTED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003279 /* if we had an established connection
3280 * and one of the nodes newly attaches a disk */
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003281 cr |= (os.conn == C_CONNECTED &&
Philipp Reisnerb411b362009-09-25 16:07:19 -07003282 (peer_state.disk == D_NEGOTIATING ||
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003283 os.disk == D_NEGOTIATING));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003284 /* if we have both been inconsistent, and the peer has been
3285 * forced to be UpToDate with --overwrite-data */
3286 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3287 /* if we had been plain connected, and the admin requested to
3288 * start a sync by "invalidate" or "invalidate-remote" */
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003289 cr |= (os.conn == C_CONNECTED &&
Philipp Reisnerb411b362009-09-25 16:07:19 -07003290 (peer_state.conn >= C_STARTING_SYNC_S &&
3291 peer_state.conn <= C_WF_BITMAP_T));
3292
3293 if (cr)
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003294 ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003295
3296 put_ldev(mdev);
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003297 if (ns.conn == C_MASK) {
3298 ns.conn = C_CONNECTED;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003299 if (mdev->state.disk == D_NEGOTIATING) {
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003300 drbd_force_state(mdev, NS(disk, D_FAILED));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003301 } else if (peer_state.disk == D_NEGOTIATING) {
3302 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3303 peer_state.disk = D_DISKLESS;
Lars Ellenberg580b9762010-02-26 23:15:23 +01003304 real_peer_disk = D_DISKLESS;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003305 } else {
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01003306 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003307 return false;
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003308 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003309 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003310 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003311 }
3312 }
3313 }
3314
3315 spin_lock_irq(&mdev->req_lock);
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003316 if (mdev->state.i != os.i)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003317 goto retry;
3318 clear_bit(CONSIDER_RESYNC, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003319 ns.peer = peer_state.role;
3320 ns.pdsk = real_peer_disk;
3321 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003322 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003323 ns.disk = mdev->new_state_tmp.disk;
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003324 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
3325 if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
Philipp Reisner481c6f52010-06-22 14:03:27 +02003326 test_bit(NEW_CUR_UUID, &mdev->flags)) {
3327 /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
3328		   for temporary network outages! */
3329 spin_unlock_irq(&mdev->req_lock);
3330		dev_err(DEV, "Aborting Connect, cannot thaw IO with a peer that is only Consistent\n");
3331 tl_clear(mdev);
3332 drbd_uuid_new_current(mdev);
3333 clear_bit(NEW_CUR_UUID, &mdev->flags);
3334 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003335 return false;
Philipp Reisner481c6f52010-06-22 14:03:27 +02003336 }
Philipp Reisner65d922c2010-06-16 16:18:09 +02003337 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003338 ns = mdev->state;
3339 spin_unlock_irq(&mdev->req_lock);
3340
3341 if (rv < SS_SUCCESS) {
3342 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003343 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003344 }
3345
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003346 if (os.conn > C_WF_REPORT_PARAMS) {
3347 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
Philipp Reisnerb411b362009-09-25 16:07:19 -07003348 peer_state.disk != D_NEGOTIATING ) {
3349 /* we want resync, peer has not yet decided to sync... */
3350			/* Nowadays this is only used when forcing a node into the primary
3351			   role and setting its disk to UpToDate at the same time */
3352 drbd_send_uuids(mdev);
Lars Ellenbergf479ea02011-10-27 16:52:30 +02003353 drbd_send_current_state(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003354 }
3355 }
3356
3357 mdev->net_conf->want_lose = 0;
3358
3359 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3360
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003361 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003362}
3363
Philipp Reisner02918be2010-08-20 14:35:10 +02003364static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003365{
Philipp Reisner02918be2010-08-20 14:35:10 +02003366 struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003367
3368 wait_event(mdev->misc_wait,
3369 mdev->state.conn == C_WF_SYNC_UUID ||
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003370 mdev->state.conn == C_BEHIND ||
Philipp Reisnerb411b362009-09-25 16:07:19 -07003371 mdev->state.conn < C_CONNECTED ||
3372 mdev->state.disk < D_NEGOTIATING);
3373
3374 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3375
Philipp Reisnerb411b362009-09-25 16:07:19 -07003376 /* Here the _drbd_uuid_ functions are right, current should
3377 _not_ be rotated into the history */
3378 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3379 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3380 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3381
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003382 drbd_print_uuids(mdev, "updated sync uuid");
Philipp Reisnerb411b362009-09-25 16:07:19 -07003383 drbd_start_resync(mdev, C_SYNC_TARGET);
3384
3385 put_ldev(mdev);
3386 } else
3387 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3388
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003389 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003390}
3391
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003392/**
3393 * receive_bitmap_plain
3394 *
3395 * Return 0 when done, 1 when another iteration is needed, and a negative error
3396 * code upon failure.
3397 */
3398static int
Philipp Reisner02918be2010-08-20 14:35:10 +02003399receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
3400 unsigned long *buffer, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003401{
3402 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3403 unsigned want = num_words * sizeof(long);
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003404 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003405
Philipp Reisner02918be2010-08-20 14:35:10 +02003406 if (want != data_size) {
3407 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003408 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003409 }
3410 if (want == 0)
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003411 return 0;
3412 err = drbd_recv(mdev, buffer, want);
3413 if (err != want) {
3414 if (err >= 0)
3415 err = -EIO;
3416 return err;
3417 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003418
3419 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3420
3421 c->word_offset += num_words;
3422 c->bit_offset = c->word_offset * BITS_PER_LONG;
3423 if (c->bit_offset > c->bm_bits)
3424 c->bit_offset = c->bm_bits;
3425
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003426 return 1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003427}
3428
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003429/**
3430 * recv_bm_rle_bits
3431 *
3432 * Return 0 when done, 1 when another iteration is needed, and a negative error
3433 * code upon failure.
3434 */
3435static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07003436recv_bm_rle_bits(struct drbd_conf *mdev,
3437 struct p_compressed_bm *p,
3438 struct bm_xfer_ctx *c)
3439{
3440 struct bitstream bs;
3441 u64 look_ahead;
3442 u64 rl;
3443 u64 tmp;
3444 unsigned long s = c->bit_offset;
3445 unsigned long e;
Lars Ellenberg004352f2010-10-05 20:13:58 +02003446 int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003447 int toggle = DCBP_get_start(p);
3448 int have;
3449 int bits;
3450
3451 bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3452
3453 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3454 if (bits < 0)
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003455 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003456
3457 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3458 bits = vli_decode_bits(&rl, look_ahead);
3459 if (bits <= 0)
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003460 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003461
3462 if (toggle) {
3463 e = s + rl -1;
3464 if (e >= c->bm_bits) {
3465 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003466 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003467 }
3468 _drbd_bm_set_bits(mdev, s, e);
3469 }
3470
3471 if (have < bits) {
3472 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3473 have, bits, look_ahead,
3474 (unsigned int)(bs.cur.b - p->code),
3475 (unsigned int)bs.buf_len);
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003476 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003477 }
3478 look_ahead >>= bits;
3479 have -= bits;
3480
3481 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3482 if (bits < 0)
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003483 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003484 look_ahead |= tmp << have;
3485 have += bits;
3486 }
3487
3488 c->bit_offset = s;
3489 bm_xfer_ctx_bit_to_word_offset(c);
3490
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003491 return (s != c->bm_bits);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003492}
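
/*
 * Worked example (illustrative only): with DCBP_get_start(p) == 0,
 * c->bit_offset == 0 and decoded run lengths 5, 3, 7, the stream
 * describes 5 clear bits, then 3 set bits, then 7 clear bits; only the
 * middle run triggers _drbd_bm_set_bits(mdev, 5, 7) above, and, if the
 * stream ends there, c->bit_offset advances to 15.
 */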
3493
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003494/**
3495 * decode_bitmap_c
3496 *
3497 * Return 0 when done, 1 when another iteration is needed, and a negative error
3498 * code upon failure.
3499 */
3500static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07003501decode_bitmap_c(struct drbd_conf *mdev,
3502 struct p_compressed_bm *p,
3503 struct bm_xfer_ctx *c)
3504{
3505 if (DCBP_get_code(p) == RLE_VLI_Bits)
3506 return recv_bm_rle_bits(mdev, p, c);
3507
3508 /* other variants had been implemented for evaluation,
3509 * but have been dropped as this one turned out to be "best"
3510 * during all our tests. */
3511
3512	dev_err(DEV, "decode_bitmap_c: unknown encoding %u\n", p->encoding);
3513 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003514 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003515}
3516
3517void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3518 const char *direction, struct bm_xfer_ctx *c)
3519{
3520 /* what would it take to transfer it "plaintext" */
Philipp Reisner0b70a132010-08-20 13:36:10 +02003521 unsigned plain = sizeof(struct p_header80) *
Philipp Reisnerb411b362009-09-25 16:07:19 -07003522 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3523 + c->bm_words * sizeof(long);
3524 unsigned total = c->bytes[0] + c->bytes[1];
3525 unsigned r;
3526
3527	/* total cannot be zero. But just in case: */
3528 if (total == 0)
3529 return;
3530
3531 /* don't report if not compressed */
3532 if (total >= plain)
3533 return;
3534
3535 /* total < plain. check for overflow, still */
3536 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3537 : (1000 * total / plain);
3538
3539 if (r > 1000)
3540 r = 1000;
3541
3542 r = 1000 - r;
3543 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3544 "total %u; compression: %u.%u%%\n",
3545 direction,
3546 c->bytes[1], c->packets[1],
3547 c->bytes[0], c->packets[0],
3548 total, r/10, r % 10);
3549}
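
/*
 * Worked example for the ratio computed above (illustrative numbers):
 * plain = 10000 bytes and total = 2500 bytes gives
 * r = 1000 * 2500 / 10000 = 250, then r = 1000 - 250 = 750, which is
 * reported as "compression: 75.0%".
 */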
3550
3551/* Since we are processing the bitfield from lower addresses to higher,
3552   it does not matter whether we process it in 32 bit chunks or 64 bit
3553   chunks, as long as it is little endian. (Understand it as a byte stream,
3554   beginning with the lowest byte...) If we used big endian,
3555   we would need to process it from the highest address to the lowest,
3556 in order to be agnostic to the 32 vs 64 bits issue.
3557
3558 returns 0 on failure, 1 if we successfully received it. */
Philipp Reisner02918be2010-08-20 14:35:10 +02003559static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003560{
3561 struct bm_xfer_ctx c;
3562 void *buffer;
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003563 int err;
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003564 int ok = false;
Philipp Reisner02918be2010-08-20 14:35:10 +02003565 struct p_header80 *h = &mdev->data.rbuf.header.h80;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003566
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003567 drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
3568 /* you are supposed to send additional out-of-sync information
3569 * if you actually set bits during this phase */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003570
3571 /* maybe we should use some per thread scratch page,
3572 * and allocate that during initial device creation? */
3573 buffer = (unsigned long *) __get_free_page(GFP_NOIO);
3574 if (!buffer) {
3575 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3576 goto out;
3577 }
3578
3579 c = (struct bm_xfer_ctx) {
3580 .bm_bits = drbd_bm_bits(mdev),
3581 .bm_words = drbd_bm_words(mdev),
3582 };
3583
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003584 for(;;) {
Philipp Reisner02918be2010-08-20 14:35:10 +02003585 if (cmd == P_BITMAP) {
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003586 err = receive_bitmap_plain(mdev, data_size, buffer, &c);
Philipp Reisner02918be2010-08-20 14:35:10 +02003587 } else if (cmd == P_COMPRESSED_BITMAP) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003588 /* MAYBE: sanity check that we speak proto >= 90,
3589 * and the feature is enabled! */
3590 struct p_compressed_bm *p;
3591
Philipp Reisner02918be2010-08-20 14:35:10 +02003592 if (data_size > BM_PACKET_PAYLOAD_BYTES) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003593 dev_err(DEV, "ReportCBitmap packet too large\n");
3594 goto out;
3595 }
3596 /* use the page buff */
3597 p = buffer;
3598 memcpy(p, h, sizeof(*h));
Philipp Reisner02918be2010-08-20 14:35:10 +02003599 if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003600 goto out;
Lars Ellenberg004352f2010-10-05 20:13:58 +02003601 if (data_size <= (sizeof(*p) - sizeof(p->head))) {
3602 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
Andreas Gruenbacher78fcbda2010-12-10 22:18:27 +01003603 goto out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003604 }
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003605 err = decode_bitmap_c(mdev, p, &c);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003606 } else {
Philipp Reisner02918be2010-08-20 14:35:10 +02003607 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003608 goto out;
3609 }
3610
Philipp Reisner02918be2010-08-20 14:35:10 +02003611 c.packets[cmd == P_BITMAP]++;
3612 c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003613
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003614 if (err <= 0) {
3615 if (err < 0)
3616 goto out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003617 break;
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003618 }
Philipp Reisner02918be2010-08-20 14:35:10 +02003619 if (!drbd_recv_header(mdev, &cmd, &data_size))
Philipp Reisnerb411b362009-09-25 16:07:19 -07003620 goto out;
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003621 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003622
3623 INFO_bm_xfer_stats(mdev, "receive", &c);
3624
3625 if (mdev->state.conn == C_WF_BITMAP_T) {
Andreas Gruenbacherde1f8e42010-12-10 21:04:00 +01003626 enum drbd_state_rv rv;
3627
Philipp Reisnerb411b362009-09-25 16:07:19 -07003628 ok = !drbd_send_bitmap(mdev);
3629 if (!ok)
3630 goto out;
3631 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
Andreas Gruenbacherde1f8e42010-12-10 21:04:00 +01003632 rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3633 D_ASSERT(rv == SS_SUCCESS);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003634 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3635 /* admin may have requested C_DISCONNECTING,
3636 * other threads may have noticed network errors */
3637 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3638 drbd_conn_str(mdev->state.conn));
3639 }
3640
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003641 ok = true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003642 out:
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003643 drbd_bm_unlock(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003644 if (ok && mdev->state.conn == C_WF_BITMAP_S)
3645 drbd_start_resync(mdev, C_SYNC_SOURCE);
3646 free_page((unsigned long) buffer);
3647 return ok;
3648}
3649
Philipp Reisner02918be2010-08-20 14:35:10 +02003650static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003651{
3652 /* TODO zero copy sink :) */
3653 static char sink[128];
3654 int size, want, r;
3655
Philipp Reisner02918be2010-08-20 14:35:10 +02003656 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3657 cmd, data_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003658
Philipp Reisner02918be2010-08-20 14:35:10 +02003659 size = data_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003660 while (size > 0) {
3661 want = min_t(int, size, sizeof(sink));
3662 r = drbd_recv(mdev, sink, want);
3663 ERR_IF(r <= 0) break;
3664 size -= r;
3665 }
3666 return size == 0;
3667}
3668
Philipp Reisner02918be2010-08-20 14:35:10 +02003669static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003670{
Philipp Reisnerb411b362009-09-25 16:07:19 -07003671 /* Make sure we've acked all the TCP data associated
3672 * with the data requests being unplugged */
3673 drbd_tcp_quickack(mdev->data.socket);
3674
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003675 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003676}
3677
Philipp Reisner73a01a12010-10-27 14:33:00 +02003678static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3679{
3680 struct p_block_desc *p = &mdev->data.rbuf.block_desc;
3681
Lars Ellenbergf735e3632010-12-17 21:06:18 +01003682 switch (mdev->state.conn) {
3683 case C_WF_SYNC_UUID:
3684 case C_WF_BITMAP_T:
3685 case C_BEHIND:
3686 break;
3687 default:
3688 dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
3689 drbd_conn_str(mdev->state.conn));
3690 }
3691
Philipp Reisner73a01a12010-10-27 14:33:00 +02003692 drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
3693
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003694 return true;
Philipp Reisner73a01a12010-10-27 14:33:00 +02003695}
3696
Philipp Reisner02918be2010-08-20 14:35:10 +02003697typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003698
Philipp Reisner02918be2010-08-20 14:35:10 +02003699struct data_cmd {
3700 int expect_payload;
3701 size_t pkt_size;
3702 drbd_cmd_handler_f function;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003703};
3704
Philipp Reisner02918be2010-08-20 14:35:10 +02003705static struct data_cmd drbd_cmd_handler[] = {
3706 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
3707 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
3708 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
3709 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
3710 [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3711 [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3712 [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
3713 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3714 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3715 [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
3716 [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
3717 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
3718 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
3719 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
3720 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
3721 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
3722 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
3723 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3724 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3725 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3726 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
Philipp Reisner73a01a12010-10-27 14:33:00 +02003727 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
Philipp Reisner02918be2010-08-20 14:35:10 +02003728 /* anything missing from this table is in
3729 * the asender_tbl, see get_asender_cmd */
3730 [P_MAX_CMD] = { 0, 0, NULL },
3731};
3732
3733/* All handler functions that expect a sub-header get that sub-header in
3734 mdev->data.rbuf.header.head.payload.
3735
3736 Usually in mdev->data.rbuf.header.head the callback can find the usual
3737   p_header, but they may not rely on that, since there is also p_header95!
3738 */
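
/*
 * Example of the dispatch below (informational): for a P_SIZES packet
 * the table above gives pkt_size == sizeof(struct p_sizes) and
 * expect_payload == 0, so drbdd() reads the fixed part of the packet
 * (pkt_size minus the common header) as sub-header into the receive
 * buffer and then calls receive_sizes() with the remaining payload
 * size, which is 0 for a conforming peer.
 */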
Philipp Reisnerb411b362009-09-25 16:07:19 -07003739
3740static void drbdd(struct drbd_conf *mdev)
3741{
Philipp Reisner02918be2010-08-20 14:35:10 +02003742 union p_header *header = &mdev->data.rbuf.header;
3743 unsigned int packet_size;
3744 enum drbd_packets cmd;
3745 size_t shs; /* sub header size */
3746 int rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003747
3748 while (get_t_state(&mdev->receiver) == Running) {
3749 drbd_thread_current_set_cpu(mdev);
Philipp Reisner02918be2010-08-20 14:35:10 +02003750 if (!drbd_recv_header(mdev, &cmd, &packet_size))
3751 goto err_out;
3752
3753 if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
3754 dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
3755 goto err_out;
Lars Ellenberg0b33a912009-11-16 15:58:04 +01003756 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003757
Philipp Reisner02918be2010-08-20 14:35:10 +02003758 shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
Philipp Reisner02918be2010-08-20 14:35:10 +02003759 if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
3760 dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
3761 goto err_out;
3762 }
3763
Lars Ellenbergc13f7e12010-10-29 23:32:01 +02003764 if (shs) {
3765 rv = drbd_recv(mdev, &header->h80.payload, shs);
3766 if (unlikely(rv != shs)) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01003767 if (!signal_pending(current))
3768 dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
Lars Ellenbergc13f7e12010-10-29 23:32:01 +02003769 goto err_out;
3770 }
3771 }
3772
Philipp Reisner02918be2010-08-20 14:35:10 +02003773 rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
3774
3775 if (unlikely(!rv)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003776 dev_err(DEV, "error receiving %s, l: %d!\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02003777 cmdname(cmd), packet_size);
3778 goto err_out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003779 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003780 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003781
Philipp Reisner02918be2010-08-20 14:35:10 +02003782 if (0) {
3783 err_out:
3784 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003785 }
Lars Ellenberg856c50c2010-10-14 13:37:40 +02003786 /* If we leave here, we probably want to update at least the
3787 * "Connected" indicator on stable storage. Do so explicitly here. */
3788 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003789}
3790
3791void drbd_flush_workqueue(struct drbd_conf *mdev)
3792{
3793 struct drbd_wq_barrier barr;
3794
3795 barr.w.cb = w_prev_work_done;
3796 init_completion(&barr.done);
3797 drbd_queue_work(&mdev->data.work, &barr.w);
3798 wait_for_completion(&barr.done);
3799}
3800
Philipp Reisnerf70b35112010-06-24 14:34:40 +02003801void drbd_free_tl_hash(struct drbd_conf *mdev)
3802{
3803 struct hlist_head *h;
3804
3805 spin_lock_irq(&mdev->req_lock);
3806
3807 if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
3808 spin_unlock_irq(&mdev->req_lock);
3809 return;
3810 }
3811 /* paranoia code */
3812 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3813 if (h->first)
3814 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3815 (int)(h - mdev->ee_hash), h->first);
3816 kfree(mdev->ee_hash);
3817 mdev->ee_hash = NULL;
3818 mdev->ee_hash_s = 0;
3819
Lars Ellenbergc12e9c82012-06-19 09:40:00 +02003820 /* We may not have had the chance to wait for all locally pending
3821 * application requests. The hlist_add_fake() prevents access after
3822 * free on master bio completion. */
3823 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) {
3824 struct drbd_request *req;
3825 struct hlist_node *pos, *n;
3826 hlist_for_each_entry_safe(req, pos, n, h, collision) {
3827 hlist_del_init(&req->collision);
3828 hlist_add_fake(&req->collision);
3829 }
3830 }
3831
Philipp Reisnerf70b35112010-06-24 14:34:40 +02003832 kfree(mdev->tl_hash);
3833 mdev->tl_hash = NULL;
3834 mdev->tl_hash_s = 0;
3835 spin_unlock_irq(&mdev->req_lock);
3836}
3837
Philipp Reisnerb411b362009-09-25 16:07:19 -07003838static void drbd_disconnect(struct drbd_conf *mdev)
3839{
3840 enum drbd_fencing_p fp;
3841 union drbd_state os, ns;
3842 int rv = SS_UNKNOWN_ERROR;
3843 unsigned int i;
3844
3845 if (mdev->state.conn == C_STANDALONE)
3846 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003847
Lars Ellenberg545752d2011-12-05 14:39:25 +01003848 /* We are about to start the cleanup after connection loss.
3849 * Make sure drbd_make_request knows about that.
3850 * Usually we should be in some network failure state already,
3851 * but just in case we are not, we fix it up here.
3852 */
3853 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
3854
Philipp Reisnerb411b362009-09-25 16:07:19 -07003855 /* asender does not clean up anything. it must not interfere, either */
3856 drbd_thread_stop(&mdev->asender);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003857 drbd_free_sock(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003858
Philipp Reisner85719572010-07-21 10:20:17 +02003859 /* wait for current activity to cease. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003860 spin_lock_irq(&mdev->req_lock);
3861 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3862 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
3863 _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
3864 spin_unlock_irq(&mdev->req_lock);
3865
3866 /* We do not have data structures that would allow us to
3867 * get the rs_pending_cnt down to 0 again.
3868 * * On C_SYNC_TARGET we do not have any data structures describing
3869 * the pending RSDataRequest's we have sent.
3870 * * On C_SYNC_SOURCE there is no data structure that tracks
3871 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
3872 * And no, it is not the sum of the reference counts in the
3873 * resync_LRU. The resync_LRU tracks the whole operation including
3874 * the disk-IO, while the rs_pending_cnt only tracks the blocks
3875 * on the fly. */
3876 drbd_rs_cancel_all(mdev);
3877 mdev->rs_total = 0;
3878 mdev->rs_failed = 0;
3879 atomic_set(&mdev->rs_pending_cnt, 0);
3880 wake_up(&mdev->misc_wait);
3881
3882 /* make sure syncer is stopped and w_resume_next_sg queued */
3883 del_timer_sync(&mdev->resync_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003884 resync_timer_fn((unsigned long)mdev);
3885
Philipp Reisnerb411b362009-09-25 16:07:19 -07003886 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
3887 * w_make_resync_request etc. which may still be on the worker queue
3888 * to be "canceled" */
3889 drbd_flush_workqueue(mdev);
3890
3891 /* This also does reclaim_net_ee(). If we do this too early, we might
3892 * miss some resync ee and pages.*/
3893 drbd_process_done_ee(mdev);
3894
3895 kfree(mdev->p_uuid);
3896 mdev->p_uuid = NULL;
3897
Philipp Reisnerfb22c402010-09-08 23:20:21 +02003898 if (!is_susp(mdev->state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07003899 tl_clear(mdev);
3900
Philipp Reisnerb411b362009-09-25 16:07:19 -07003901 dev_info(DEV, "Connection closed\n");
3902
3903 drbd_md_sync(mdev);
3904
3905 fp = FP_DONT_CARE;
3906 if (get_ldev(mdev)) {
3907 fp = mdev->ldev->dc.fencing;
3908 put_ldev(mdev);
3909 }
3910
Philipp Reisner87f7be42010-06-11 13:56:33 +02003911 if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
3912 drbd_try_outdate_peer_async(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003913
3914 spin_lock_irq(&mdev->req_lock);
3915 os = mdev->state;
3916 if (os.conn >= C_UNCONNECTED) {
3917 /* Do not restart in case we are C_DISCONNECTING */
3918 ns = os;
3919 ns.conn = C_UNCONNECTED;
3920 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
3921 }
3922 spin_unlock_irq(&mdev->req_lock);
3923
3924 if (os.conn == C_DISCONNECTING) {
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003925 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003926
Philipp Reisnerb411b362009-09-25 16:07:19 -07003927 crypto_free_hash(mdev->cram_hmac_tfm);
3928 mdev->cram_hmac_tfm = NULL;
3929
3930 kfree(mdev->net_conf);
3931 mdev->net_conf = NULL;
3932 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3933 }
3934
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003935 /* serialize with bitmap writeout triggered by the state change,
3936 * if any. */
3937 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
3938
Philipp Reisnerb411b362009-09-25 16:07:19 -07003939 /* tcp_close and release of sendpage pages can be deferred. I don't
3940 * want to use SO_LINGER, because apparently it can be deferred for
3941 * more than 20 seconds (longest time I checked).
3942 *
3943 * Actually we don't care for exactly when the network stack does its
3944 * put_page(), but release our reference on these pages right here.
3945 */
3946 i = drbd_release_ee(mdev, &mdev->net_ee);
3947 if (i)
3948 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003949 i = atomic_read(&mdev->pp_in_use_by_net);
3950 if (i)
3951 dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003952 i = atomic_read(&mdev->pp_in_use);
3953 if (i)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02003954 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003955
3956 D_ASSERT(list_empty(&mdev->read_ee));
3957 D_ASSERT(list_empty(&mdev->active_ee));
3958 D_ASSERT(list_empty(&mdev->sync_ee));
3959 D_ASSERT(list_empty(&mdev->done_ee));
3960
3961 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
3962 atomic_set(&mdev->current_epoch->epoch_size, 0);
3963 D_ASSERT(list_empty(&mdev->current_epoch->list));
3964}
3965
3966/*
3967 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3968 * we can agree on is stored in agreed_pro_version.
3969 *
3970 * feature flags and the reserved array should be enough room for future
3971 * enhancements of the handshake protocol, and possible plugins...
3972 *
3973 * for now, they are expected to be zero, but ignored.
3974 */
3975static int drbd_send_handshake(struct drbd_conf *mdev)
3976{
3977 /* ASSERT current == mdev->receiver ... */
3978 struct p_handshake *p = &mdev->data.sbuf.handshake;
3979 int ok;
3980
3981 if (mutex_lock_interruptible(&mdev->data.mutex)) {
3982 dev_err(DEV, "interrupted during initial handshake\n");
3983 return 0; /* interrupted. not ok. */
3984 }
3985
3986 if (mdev->data.socket == NULL) {
3987 mutex_unlock(&mdev->data.mutex);
3988 return 0;
3989 }
3990
3991 memset(p, 0, sizeof(*p));
3992 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3993 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3994 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02003995 (struct p_header80 *)p, sizeof(*p), 0 );
Philipp Reisnerb411b362009-09-25 16:07:19 -07003996 mutex_unlock(&mdev->data.mutex);
3997 return ok;
3998}
3999
4000/*
4001 * return values:
4002 * 1 yes, we have a valid connection
4003 * 0 oops, did not work out, please try again
4004 * -1 peer talks different language,
4005 * no point in trying again, please go standalone.
4006 */
4007static int drbd_do_handshake(struct drbd_conf *mdev)
4008{
4009 /* ASSERT current == mdev->receiver ... */
4010 struct p_handshake *p = &mdev->data.rbuf.handshake;
Philipp Reisner02918be2010-08-20 14:35:10 +02004011 const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
4012 unsigned int length;
4013 enum drbd_packets cmd;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004014 int rv;
4015
4016 rv = drbd_send_handshake(mdev);
4017 if (!rv)
4018 return 0;
4019
Philipp Reisner02918be2010-08-20 14:35:10 +02004020 rv = drbd_recv_header(mdev, &cmd, &length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004021 if (!rv)
4022 return 0;
4023
Philipp Reisner02918be2010-08-20 14:35:10 +02004024 if (cmd != P_HAND_SHAKE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004025 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02004026 cmdname(cmd), cmd);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004027 return -1;
4028 }
4029
Philipp Reisner02918be2010-08-20 14:35:10 +02004030 if (length != expect) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004031 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02004032 expect, length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004033 return -1;
4034 }
4035
4036 rv = drbd_recv(mdev, &p->head.payload, expect);
4037
4038 if (rv != expect) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01004039 if (!signal_pending(current))
4040 dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004041 return 0;
4042 }
4043
Philipp Reisnerb411b362009-09-25 16:07:19 -07004044 p->protocol_min = be32_to_cpu(p->protocol_min);
4045 p->protocol_max = be32_to_cpu(p->protocol_max);
4046 if (p->protocol_max == 0)
4047 p->protocol_max = p->protocol_min;
4048
4049 if (PRO_VERSION_MAX < p->protocol_min ||
4050 PRO_VERSION_MIN > p->protocol_max)
4051 goto incompat;
4052
4053 mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
4054
4055 dev_info(DEV, "Handshake successful: "
4056 "Agreed network protocol version %d\n", mdev->agreed_pro_version);
4057
4058 return 1;
4059
4060 incompat:
4061 dev_err(DEV, "incompatible DRBD dialects: "
4062 "I support %d-%d, peer supports %d-%d\n",
4063 PRO_VERSION_MIN, PRO_VERSION_MAX,
4064 p->protocol_min, p->protocol_max);
4065 return -1;
4066}
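
/*
 * Worked example (hypothetical version numbers): if we supported
 * protocol versions 86..96 and the peer advertised 90..100, the ranges
 * overlap, so the handshake above succeeds and agreed_pro_version
 * becomes min(96, 100) == 96.  If the peer advertised only 97..100,
 * we would take the "incompatible DRBD dialects" path instead.
 */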
4067
4068#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
4069static int drbd_do_auth(struct drbd_conf *mdev)
4070{
4071	dev_err(DEV, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
4072 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004073 return -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004074}
4075#else
4076#define CHALLENGE_LEN 64
Johannes Thomab10d96c2010-01-07 16:02:50 +01004077
4078/* Return value:
4079 1 - auth succeeded,
4080 0 - failed, try again (network error),
4081 -1 - auth failed, don't try again.
4082*/
4083
Philipp Reisnerb411b362009-09-25 16:07:19 -07004084static int drbd_do_auth(struct drbd_conf *mdev)
4085{
4086 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
4087 struct scatterlist sg;
4088 char *response = NULL;
4089 char *right_response = NULL;
4090 char *peers_ch = NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004091 unsigned int key_len = strlen(mdev->net_conf->shared_secret);
4092 unsigned int resp_size;
4093 struct hash_desc desc;
Philipp Reisner02918be2010-08-20 14:35:10 +02004094 enum drbd_packets cmd;
4095 unsigned int length;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004096 int rv;
4097
4098 desc.tfm = mdev->cram_hmac_tfm;
4099 desc.flags = 0;
4100
4101 rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
4102 (u8 *)mdev->net_conf->shared_secret, key_len);
4103 if (rv) {
4104 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
Johannes Thomab10d96c2010-01-07 16:02:50 +01004105 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004106 goto fail;
4107 }
4108
4109 get_random_bytes(my_challenge, CHALLENGE_LEN);
4110
4111 rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
4112 if (!rv)
4113 goto fail;
4114
Philipp Reisner02918be2010-08-20 14:35:10 +02004115 rv = drbd_recv_header(mdev, &cmd, &length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004116 if (!rv)
4117 goto fail;
4118
Philipp Reisner02918be2010-08-20 14:35:10 +02004119 if (cmd != P_AUTH_CHALLENGE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004120 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02004121 cmdname(cmd), cmd);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004122 rv = 0;
4123 goto fail;
4124 }
4125
Philipp Reisner02918be2010-08-20 14:35:10 +02004126 if (length > CHALLENGE_LEN * 2) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004127 dev_err(DEV, "expected AuthChallenge payload too big.\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004128 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004129 goto fail;
4130 }
4131
Philipp Reisner02918be2010-08-20 14:35:10 +02004132 peers_ch = kmalloc(length, GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004133 if (peers_ch == NULL) {
4134 dev_err(DEV, "kmalloc of peers_ch failed\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004135 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004136 goto fail;
4137 }
4138
Philipp Reisner02918be2010-08-20 14:35:10 +02004139 rv = drbd_recv(mdev, peers_ch, length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004140
Philipp Reisner02918be2010-08-20 14:35:10 +02004141 if (rv != length) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01004142 if (!signal_pending(current))
4143 dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004144 rv = 0;
4145 goto fail;
4146 }
4147
4148 resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
4149 response = kmalloc(resp_size, GFP_NOIO);
4150 if (response == NULL) {
4151 dev_err(DEV, "kmalloc of response failed\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004152 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004153 goto fail;
4154 }
4155
4156 sg_init_table(&sg, 1);
Philipp Reisner02918be2010-08-20 14:35:10 +02004157 sg_set_buf(&sg, peers_ch, length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004158
4159 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4160 if (rv) {
4161 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
Johannes Thomab10d96c2010-01-07 16:02:50 +01004162 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004163 goto fail;
4164 }
4165
4166 rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
4167 if (!rv)
4168 goto fail;
4169
Philipp Reisner02918be2010-08-20 14:35:10 +02004170 rv = drbd_recv_header(mdev, &cmd, &length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004171 if (!rv)
4172 goto fail;
4173
Philipp Reisner02918be2010-08-20 14:35:10 +02004174 if (cmd != P_AUTH_RESPONSE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004175 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02004176 cmdname(cmd), cmd);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004177 rv = 0;
4178 goto fail;
4179 }
4180
Philipp Reisner02918be2010-08-20 14:35:10 +02004181 if (length != resp_size) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004182		dev_err(DEV, "AuthResponse payload has wrong size\n");
4183 rv = 0;
4184 goto fail;
4185 }
4186
4187	rv = drbd_recv(mdev, response, resp_size);
4188
4189 if (rv != resp_size) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01004190 if (!signal_pending(current))
4191 dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004192 rv = 0;
4193 goto fail;
4194 }
4195
4196 right_response = kmalloc(resp_size, GFP_NOIO);
Julia Lawall2d1ee872009-12-27 22:27:11 +01004197 if (right_response == NULL) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004198 dev_err(DEV, "kmalloc of right_response failed\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004199 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004200 goto fail;
4201 }
4202
4203 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4204
4205 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4206 if (rv) {
4207 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
Johannes Thomab10d96c2010-01-07 16:02:50 +01004208 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004209 goto fail;
4210 }
4211
4212 rv = !memcmp(response, right_response, resp_size);
4213
4214 if (rv)
4215 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
4216 resp_size, mdev->net_conf->cram_hmac_alg);
Johannes Thomab10d96c2010-01-07 16:02:50 +01004217 else
4218 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004219
4220 fail:
4221 kfree(peers_ch);
4222 kfree(response);
4223 kfree(right_response);
4224
4225 return rv;
4226}
4227#endif
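
/*
 * Illustrative sketch only (not part of the driver): the handshake above is
 * symmetric.  Each side sends a random challenge, answers the peer's
 * challenge with HMAC(shared_secret, peer_challenge), and accepts the peer
 * only if the peer's answer matches the HMAC computed over its own
 * challenge.  xchg_packet() and hmac_digest() are hypothetical helpers
 * standing in for drbd_send_cmd2()/drbd_recv() and the crypto_hash_digest()
 * calls actually used in drbd_do_auth(); the digest size is assumed to be
 * at most 64 bytes.
 */
#if 0	/* never compiled, illustration only */
static int drbd_do_auth_sketch(struct drbd_conf *mdev,
			       const u8 *secret, unsigned int secret_len,
			       unsigned int resp_size)
{
	char my_challenge[CHALLENGE_LEN], peers_ch[CHALLENGE_LEN];
	char response[64], peers_response[64], right_response[64];

	get_random_bytes(my_challenge, CHALLENGE_LEN);

	/* both sides send their challenge and receive the peer's */
	xchg_packet(mdev, P_AUTH_CHALLENGE, my_challenge, peers_ch, CHALLENGE_LEN);

	/* answer the peer's challenge ... */
	hmac_digest(secret, secret_len, peers_ch, CHALLENGE_LEN, response);
	xchg_packet(mdev, P_AUTH_RESPONSE, response, peers_response, resp_size);

	/* ... and verify the peer's answer against the challenge we sent */
	hmac_digest(secret, secret_len, my_challenge, CHALLENGE_LEN, right_response);

	/* same convention as drbd_do_auth(): 1 on success, -1 on mismatch */
	return memcmp(peers_response, right_response, resp_size) == 0 ? 1 : -1;
}
#endif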
4228
4229int drbdd_init(struct drbd_thread *thi)
4230{
4231 struct drbd_conf *mdev = thi->mdev;
4232 unsigned int minor = mdev_to_minor(mdev);
4233 int h;
4234
4235 sprintf(current->comm, "drbd%d_receiver", minor);
4236
4237 dev_info(DEV, "receiver (re)started\n");
4238
4239 do {
4240 h = drbd_connect(mdev);
4241 if (h == 0) {
4242 drbd_disconnect(mdev);
Philipp Reisner20ee6392011-01-18 15:28:59 +01004243 schedule_timeout_interruptible(HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004244 }
4245 if (h == -1) {
4246 dev_warn(DEV, "Discarding network configuration.\n");
4247 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4248 }
4249 } while (h == 0);
4250
4251 if (h > 0) {
4252 if (get_net_conf(mdev)) {
4253 drbdd(mdev);
4254 put_net_conf(mdev);
4255 }
4256 }
4257
4258 drbd_disconnect(mdev);
4259
4260 dev_info(DEV, "receiver terminated\n");
4261 return 0;
4262}
4263
4264/* ********* acknowledge sender ******** */
4265
Philipp Reisner0b70a132010-08-20 13:36:10 +02004266static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004267{
4268 struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4269
4270 int retcode = be32_to_cpu(p->retcode);
4271
4272 if (retcode >= SS_SUCCESS) {
4273 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4274 } else {
4275 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4276 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4277 drbd_set_st_err_str(retcode), retcode);
4278 }
4279 wake_up(&mdev->state_wait);
4280
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004281 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004282}
4283
Philipp Reisner0b70a132010-08-20 13:36:10 +02004284static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004285{
4286 return drbd_send_ping_ack(mdev);
4287
4288}
4289
Philipp Reisner0b70a132010-08-20 13:36:10 +02004290static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004291{
4292 /* restore idle timeout */
4293 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
Philipp Reisner309d1602010-03-02 15:03:44 +01004294 if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
4295 wake_up(&mdev->misc_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004296
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004297 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004298}
4299
Philipp Reisner0b70a132010-08-20 13:36:10 +02004300static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004301{
4302 struct p_block_ack *p = (struct p_block_ack *)h;
4303 sector_t sector = be64_to_cpu(p->sector);
4304 int blksize = be32_to_cpu(p->blksize);
4305
4306 D_ASSERT(mdev->agreed_pro_version >= 89);
4307
4308 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4309
Lars Ellenberg1d53f092010-09-05 01:13:24 +02004310 if (get_ldev(mdev)) {
4311 drbd_rs_complete_io(mdev, sector);
4312 drbd_set_in_sync(mdev, sector, blksize);
4313 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4314 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4315 put_ldev(mdev);
4316 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004317 dec_rs_pending(mdev);
Philipp Reisner778f2712010-07-06 11:14:00 +02004318 atomic_add(blksize >> 9, &mdev->rs_sect_in);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004319
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004320 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004321}
4322
4323/* when we receive the ACK for a write request,
4324 * verify that we actually know about it: the block_id echoed by the peer
 * is the address of our original request; look it up in the tl_hash slot
 * for that sector and double check that the sector matches */
4325static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4326 u64 id, sector_t sector)
4327{
4328 struct hlist_head *slot = tl_hash_slot(mdev, sector);
4329 struct hlist_node *n;
4330 struct drbd_request *req;
4331
Bart Van Assche24c48302011-05-21 18:32:29 +02004332 hlist_for_each_entry(req, n, slot, collision) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004333 if ((unsigned long)req == (unsigned long)id) {
4334 if (req->sector != sector) {
4335 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4336 "wrong sector (%llus versus %llus)\n", req,
4337 (unsigned long long)req->sector,
4338 (unsigned long long)sector);
4339 break;
4340 }
4341 return req;
4342 }
4343 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004344 return NULL;
4345}
4346
4347typedef struct drbd_request *(req_validator_fn)
4348 (struct drbd_conf *mdev, u64 id, sector_t sector);
4349
4350static int validate_req_change_req_state(struct drbd_conf *mdev,
4351 u64 id, sector_t sector, req_validator_fn validator,
4352 const char *func, enum drbd_req_event what)
4353{
4354 struct drbd_request *req;
4355 struct bio_and_error m;
4356
4357 spin_lock_irq(&mdev->req_lock);
4358 req = validator(mdev, id, sector);
4359 if (unlikely(!req)) {
4360 spin_unlock_irq(&mdev->req_lock);
Philipp Reisner2deb8332011-01-17 18:39:18 +01004361
4362 dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func,
4363 (void *)(unsigned long)id, (unsigned long long)sector);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004364 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004365 }
4366 __req_mod(req, what, &m);
4367 spin_unlock_irq(&mdev->req_lock);
4368
4369 if (m.bio)
4370 complete_master_bio(mdev, &m);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004371 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004372}
4373
Philipp Reisner0b70a132010-08-20 13:36:10 +02004374static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004375{
4376 struct p_block_ack *p = (struct p_block_ack *)h;
4377 sector_t sector = be64_to_cpu(p->sector);
4378 int blksize = be32_to_cpu(p->blksize);
4379 enum drbd_req_event what;
4380
4381 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4382
4383 if (is_syncer_block_id(p->block_id)) {
4384 drbd_set_in_sync(mdev, sector, blksize);
4385 dec_rs_pending(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004386 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004387 }
4388 switch (be16_to_cpu(h->command)) {
4389 case P_RS_WRITE_ACK:
4390 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4391 what = write_acked_by_peer_and_sis;
4392 break;
4393 case P_WRITE_ACK:
4394 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4395 what = write_acked_by_peer;
4396 break;
4397 case P_RECV_ACK:
4398 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4399 what = recv_acked_by_peer;
4400 break;
4401 case P_DISCARD_ACK:
4402 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4403 what = conflict_discarded_by_peer;
4404 break;
4405 default:
4406 D_ASSERT(0);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004407 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004408 }
4409
4410 return validate_req_change_req_state(mdev, p->block_id, sector,
4411 _ack_id_to_req, __func__ , what);
4412}
4413
Philipp Reisner0b70a132010-08-20 13:36:10 +02004414static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004415{
4416 struct p_block_ack *p = (struct p_block_ack *)h;
4417 sector_t sector = be64_to_cpu(p->sector);
Philipp Reisner2deb8332011-01-17 18:39:18 +01004418 int size = be32_to_cpu(p->blksize);
4419 struct drbd_request *req;
4420 struct bio_and_error m;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004421
4422 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4423
4424 if (is_syncer_block_id(p->block_id)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004425 dec_rs_pending(mdev);
4426 drbd_rs_failed_io(mdev, sector, size);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004427 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004428 }
Philipp Reisner2deb8332011-01-17 18:39:18 +01004429
4430 spin_lock_irq(&mdev->req_lock);
4431 req = _ack_id_to_req(mdev, p->block_id, sector);
4432 if (!req) {
4433 spin_unlock_irq(&mdev->req_lock);
4434 if (mdev->net_conf->wire_protocol == DRBD_PROT_A ||
4435 mdev->net_conf->wire_protocol == DRBD_PROT_B) {
4436 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
4437 The master bio might already be completed, therefore the
4438 request is no longer in the collision hash.
4439 => Do not try to validate block_id as request. */
4440 /* In Protocol B we might already have got a P_RECV_ACK
4441	   but then get a P_NEG_ACK afterwards. */
4442 drbd_set_out_of_sync(mdev, sector, size);
4443 return true;
4444 } else {
4445 dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__,
4446 (void *)(unsigned long)p->block_id, (unsigned long long)sector);
4447 return false;
4448 }
4449 }
4450 __req_mod(req, neg_acked, &m);
4451 spin_unlock_irq(&mdev->req_lock);
4452
4453 if (m.bio)
4454 complete_master_bio(mdev, &m);
4455 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004456}
4457
Philipp Reisner0b70a132010-08-20 13:36:10 +02004458static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004459{
4460 struct p_block_ack *p = (struct p_block_ack *)h;
4461 sector_t sector = be64_to_cpu(p->sector);
4462
4463 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4464 dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4465 (unsigned long long)sector, be32_to_cpu(p->blksize));
4466
4467 return validate_req_change_req_state(mdev, p->block_id, sector,
4468 _ar_id_to_req, __func__ , neg_acked);
4469}
4470
Philipp Reisner0b70a132010-08-20 13:36:10 +02004471static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004472{
4473 sector_t sector;
4474 int size;
4475 struct p_block_ack *p = (struct p_block_ack *)h;
4476
4477 sector = be64_to_cpu(p->sector);
4478 size = be32_to_cpu(p->blksize);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004479
4480 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4481
4482 dec_rs_pending(mdev);
4483
4484 if (get_ldev_if_state(mdev, D_FAILED)) {
4485 drbd_rs_complete_io(mdev, sector);
Philipp Reisnerd612d302010-12-27 10:53:28 +01004486 switch (be16_to_cpu(h->command)) {
4487 case P_NEG_RS_DREPLY:
4488 drbd_rs_failed_io(mdev, sector, size);
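			/* fall through */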
4489 case P_RS_CANCEL:
4490 break;
4491 default:
4492 D_ASSERT(0);
4493 put_ldev(mdev);
4494 return false;
4495 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004496 put_ldev(mdev);
4497 }
4498
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004499 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004500}
4501
Philipp Reisner0b70a132010-08-20 13:36:10 +02004502static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004503{
4504 struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4505
4506 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4507
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02004508 if (mdev->state.conn == C_AHEAD &&
4509 atomic_read(&mdev->ap_in_flight) == 0 &&
Philipp Reisnere89868a2011-11-09 21:04:03 +01004510 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
Philipp Reisner370a43e2011-01-14 16:03:11 +01004511 mdev->start_resync_timer.expires = jiffies + HZ;
4512 add_timer(&mdev->start_resync_timer);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02004513 }
4514
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004515 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004516}
4517
Philipp Reisner0b70a132010-08-20 13:36:10 +02004518static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004519{
4520 struct p_block_ack *p = (struct p_block_ack *)h;
4521 struct drbd_work *w;
4522 sector_t sector;
4523 int size;
4524
4525 sector = be64_to_cpu(p->sector);
4526 size = be32_to_cpu(p->blksize);
4527
4528 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4529
4530 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4531 drbd_ov_oos_found(mdev, sector, size);
4532 else
4533 ov_oos_print(mdev);
4534
Lars Ellenberg1d53f092010-09-05 01:13:24 +02004535 if (!get_ldev(mdev))
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004536 return true;
Lars Ellenberg1d53f092010-09-05 01:13:24 +02004537
Philipp Reisnerb411b362009-09-25 16:07:19 -07004538 drbd_rs_complete_io(mdev, sector);
4539 dec_rs_pending(mdev);
4540
Lars Ellenbergea5442a2010-11-05 09:48:01 +01004541 --mdev->ov_left;
4542
4543 /* let's advance progress step marks only for every other megabyte */
4544 if ((mdev->ov_left & 0x200) == 0x200)
4545 drbd_advance_rs_marks(mdev, mdev->ov_left);
4546
4547 if (mdev->ov_left == 0) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004548 w = kmalloc(sizeof(*w), GFP_NOIO);
4549 if (w) {
4550 w->cb = w_ov_finished;
4551 drbd_queue_work_front(&mdev->data.work, w);
4552 } else {
4553			dev_err(DEV, "kmalloc(w) failed.\n");
4554 ov_oos_print(mdev);
4555 drbd_resync_finished(mdev);
4556 }
4557 }
Lars Ellenberg1d53f092010-09-05 01:13:24 +02004558 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004559 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004560}
4561
Philipp Reisner02918be2010-08-20 14:35:10 +02004562static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisner0ced55a2010-04-30 15:26:20 +02004563{
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004564 return true;
Philipp Reisner0ced55a2010-04-30 15:26:20 +02004565}
4566
Philipp Reisnerb411b362009-09-25 16:07:19 -07004567struct asender_cmd {
4568 size_t pkt_size;
Philipp Reisner0b70a132010-08-20 13:36:10 +02004569 int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004570};
4571
4572static struct asender_cmd *get_asender_cmd(int cmd)
4573{
4574 static struct asender_cmd asender_tbl[] = {
4575 /* anything missing from this table is in
4576 * the drbd_cmd_handler (drbd_default_handler) table,
4577 * see the beginning of drbdd() */
Philipp Reisner0b70a132010-08-20 13:36:10 +02004578 [P_PING] = { sizeof(struct p_header80), got_Ping },
4579 [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
Philipp Reisnerb411b362009-09-25 16:07:19 -07004580 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4581 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4582 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4583 [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4584 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
4585 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
4586 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
4587 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
4588 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4589 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4590 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
Philipp Reisner02918be2010-08-20 14:35:10 +02004591 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
Philipp Reisnerd612d302010-12-27 10:53:28 +01004592 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply},
Philipp Reisnerb411b362009-09-25 16:07:19 -07004593 [P_MAX_CMD] = { 0, NULL },
4594 };
4595 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4596 return NULL;
4597 return &asender_tbl[cmd];
4598}
4599
4600int drbd_asender(struct drbd_thread *thi)
4601{
4602 struct drbd_conf *mdev = thi->mdev;
Philipp Reisner02918be2010-08-20 14:35:10 +02004603 struct p_header80 *h = &mdev->meta.rbuf.header.h80;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004604 struct asender_cmd *cmd = NULL;
4605
4606 int rv, len;
4607 void *buf = h;
4608 int received = 0;
Philipp Reisner0b70a132010-08-20 13:36:10 +02004609 int expect = sizeof(struct p_header80);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004610 int empty;
Lars Ellenbergf36af182011-03-09 22:44:55 +01004611 int ping_timeout_active = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004612
4613 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4614
4615 current->policy = SCHED_RR; /* Make this a realtime task! */
4616 current->rt_priority = 2; /* more important than all other tasks */
4617
4618 while (get_t_state(thi) == Running) {
4619 drbd_thread_current_set_cpu(mdev);
4620 if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
4621 ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
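			/* ping_timeo is configured in tenths of a second;
			 * wait that long for the PingAck */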
4622 mdev->meta.socket->sk->sk_rcvtimeo =
4623 mdev->net_conf->ping_timeo*HZ/10;
Lars Ellenbergf36af182011-03-09 22:44:55 +01004624 ping_timeout_active = 1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004625 }
4626
4627 /* conditionally cork;
4628 * it may hurt latency if we cork without much to send */
4629 if (!mdev->net_conf->no_cork &&
4630 3 < atomic_read(&mdev->unacked_cnt))
4631 drbd_tcp_cork(mdev->meta.socket);
4632 while (1) {
4633 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4634 flush_signals(current);
Lars Ellenberg0f8488e2010-10-13 18:19:23 +02004635 if (!drbd_process_done_ee(mdev))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004636 goto reconnect;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004637 /* to avoid race with newly queued ACKs */
4638 set_bit(SIGNAL_ASENDER, &mdev->flags);
4639 spin_lock_irq(&mdev->req_lock);
4640 empty = list_empty(&mdev->done_ee);
4641 spin_unlock_irq(&mdev->req_lock);
4642 /* new ack may have been queued right here,
4643 * but then there is also a signal pending,
4644 * and we start over... */
4645 if (empty)
4646 break;
4647 }
4648 /* but unconditionally uncork unless disabled */
4649 if (!mdev->net_conf->no_cork)
4650 drbd_tcp_uncork(mdev->meta.socket);
4651
4652 /* short circuit, recv_msg would return EINTR anyways. */
4653 if (signal_pending(current))
4654 continue;
4655
4656 rv = drbd_recv_short(mdev, mdev->meta.socket,
4657 buf, expect-received, 0);
4658 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4659
4660 flush_signals(current);
4661
4662 /* Note:
4663 * -EINTR (on meta) we got a signal
4664 * -EAGAIN (on meta) rcvtimeo expired
4665 * -ECONNRESET other side closed the connection
4666 * -ERESTARTSYS (on data) we got a signal
4667 * rv < 0 other than above: unexpected error!
4668 * rv == expected: full header or command
4669 * rv < expected: "woken" by signal during receive
4670 * rv == 0 : "connection shut down by peer"
4671 */
4672 if (likely(rv > 0)) {
4673 received += rv;
4674 buf += rv;
4675 } else if (rv == 0) {
4676 dev_err(DEV, "meta connection shut down by peer.\n");
4677 goto reconnect;
4678 } else if (rv == -EAGAIN) {
Lars Ellenbergcb6518c2011-06-20 14:44:45 +02004679 /* If the data socket received something meanwhile,
4680 * that is good enough: peer is still alive. */
4681 if (time_after(mdev->last_received,
4682 jiffies - mdev->meta.socket->sk->sk_rcvtimeo))
4683 continue;
Lars Ellenbergf36af182011-03-09 22:44:55 +01004684 if (ping_timeout_active) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004685 dev_err(DEV, "PingAck did not arrive in time.\n");
4686 goto reconnect;
4687 }
4688 set_bit(SEND_PING, &mdev->flags);
4689 continue;
4690 } else if (rv == -EINTR) {
4691 continue;
4692 } else {
4693 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
4694 goto reconnect;
4695 }
4696
4697 if (received == expect && cmd == NULL) {
4698 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
Lars Ellenberg004352f2010-10-05 20:13:58 +02004699 dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
4700 be32_to_cpu(h->magic),
4701 be16_to_cpu(h->command),
4702 be16_to_cpu(h->length));
Philipp Reisnerb411b362009-09-25 16:07:19 -07004703 goto reconnect;
4704 }
4705 cmd = get_asender_cmd(be16_to_cpu(h->command));
4706 len = be16_to_cpu(h->length);
4707 if (unlikely(cmd == NULL)) {
Lars Ellenberg004352f2010-10-05 20:13:58 +02004708 dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
4709 be32_to_cpu(h->magic),
4710 be16_to_cpu(h->command),
4711 be16_to_cpu(h->length));
Philipp Reisnerb411b362009-09-25 16:07:19 -07004712 goto disconnect;
4713 }
4714 expect = cmd->pkt_size;
Philipp Reisner0b70a132010-08-20 13:36:10 +02004715 ERR_IF(len != expect-sizeof(struct p_header80))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004716 goto reconnect;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004717 }
4718 if (received == expect) {
Lars Ellenbergcb6518c2011-06-20 14:44:45 +02004719 mdev->last_received = jiffies;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004720 D_ASSERT(cmd != NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004721 if (!cmd->process(mdev, h))
4722 goto reconnect;
4723
Lars Ellenbergf36af182011-03-09 22:44:55 +01004724 /* the idle_timeout (ping-int)
4725 * has been restored in got_PingAck() */
4726 if (cmd == get_asender_cmd(P_PING_ACK))
4727 ping_timeout_active = 0;
4728
Philipp Reisnerb411b362009-09-25 16:07:19 -07004729 buf = h;
4730 received = 0;
Philipp Reisner0b70a132010-08-20 13:36:10 +02004731 expect = sizeof(struct p_header80);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004732 cmd = NULL;
4733 }
4734 }
4735
4736 if (0) {
4737reconnect:
4738 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
Lars Ellenberg856c50c2010-10-14 13:37:40 +02004739 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004740 }
4741 if (0) {
4742disconnect:
4743 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Lars Ellenberg856c50c2010-10-14 13:37:40 +02004744 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004745 }
4746 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4747
4748 D_ASSERT(mdev->state.conn < C_CONNECTED);
4749 dev_info(DEV, "asender terminated\n");
4750
4751 return 0;
4752}