blob: 9e9fc341313761e20a9ecacf765b8a2230ebb969 [file] [log] [blame]
Philipp Reisnerb411b362009-09-25 16:07:19 -07001/*
2 drbd_receiver.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
Philipp Reisnerb411b362009-09-25 16:07:19 -070026#include <linux/module.h>
27
28#include <asm/uaccess.h>
29#include <net/sock.h>
30
Philipp Reisnerb411b362009-09-25 16:07:19 -070031#include <linux/drbd.h>
32#include <linux/fs.h>
33#include <linux/file.h>
34#include <linux/in.h>
35#include <linux/mm.h>
36#include <linux/memcontrol.h>
37#include <linux/mm_inline.h>
38#include <linux/slab.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070039#include <linux/pkt_sched.h>
40#define __KERNEL_SYSCALLS__
41#include <linux/unistd.h>
42#include <linux/vmalloc.h>
43#include <linux/random.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070044#include <linux/string.h>
45#include <linux/scatterlist.h>
46#include "drbd_int.h"
Philipp Reisnerb411b362009-09-25 16:07:19 -070047#include "drbd_req.h"
48
49#include "drbd_vli.h"
50
Philipp Reisnerb411b362009-09-25 16:07:19 -070051enum finish_epoch {
52 FE_STILL_LIVE,
53 FE_DESTROYED,
54 FE_RECYCLED,
55};
56
57static int drbd_do_handshake(struct drbd_conf *mdev);
58static int drbd_do_auth(struct drbd_conf *mdev);
59
60static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
61static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
62
Philipp Reisnerb411b362009-09-25 16:07:19 -070063
64#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
65
Lars Ellenberg45bb9122010-05-14 17:10:48 +020066/*
67 * some helper functions to deal with single linked page lists,
68 * page->private being our "next" pointer.
69 */
70
71/* If at least n pages are linked at head, get n pages off.
72 * Otherwise, don't modify head, and return NULL.
73 * Locking is the responsibility of the caller.
74 */
75static struct page *page_chain_del(struct page **head, int n)
76{
77 struct page *page;
78 struct page *tmp;
79
80 BUG_ON(!n);
81 BUG_ON(!head);
82
83 page = *head;
Philipp Reisner23ce4222010-05-20 13:35:31 +020084
85 if (!page)
86 return NULL;
87
Lars Ellenberg45bb9122010-05-14 17:10:48 +020088 while (page) {
89 tmp = page_chain_next(page);
90 if (--n == 0)
91 break; /* found sufficient pages */
92 if (tmp == NULL)
93 /* insufficient pages, don't use any of them. */
94 return NULL;
95 page = tmp;
96 }
97
98 /* add end of list marker for the returned list */
99 set_page_private(page, 0);
100 /* actual return value, and adjustment of head */
101 page = *head;
102 *head = tmp;
103 return page;
104}
105
106/* may be used outside of locks to find the tail of a (usually short)
107 * "private" page chain, before adding it back to a global chain head
108 * with page_chain_add() under a spinlock. */
109static struct page *page_chain_tail(struct page *page, int *len)
110{
111 struct page *tmp;
112 int i = 1;
113 while ((tmp = page_chain_next(page)))
114 ++i, page = tmp;
115 if (len)
116 *len = i;
117 return page;
118}
119
120static int page_chain_free(struct page *page)
121{
122 struct page *tmp;
123 int i = 0;
124 page_chain_for_each_safe(page, tmp) {
125 put_page(page);
126 ++i;
127 }
128 return i;
129}
130
131static void page_chain_add(struct page **head,
132 struct page *chain_first, struct page *chain_last)
133{
134#if 1
135 struct page *tmp;
136 tmp = page_chain_tail(chain_first, NULL);
137 BUG_ON(tmp != chain_last);
138#endif
139
140 /* add chain to head */
141 set_page_private(chain_last, (unsigned long)*head);
142 *head = chain_first;
143}
144
145static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700146{
147 struct page *page = NULL;
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200148 struct page *tmp = NULL;
149 int i = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700150
151 /* Yes, testing drbd_pp_vacant outside the lock is racy.
152 * So what. It saves a spin_lock. */
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200153 if (drbd_pp_vacant >= number) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700154 spin_lock(&drbd_pp_lock);
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200155 page = page_chain_del(&drbd_pp_pool, number);
156 if (page)
157 drbd_pp_vacant -= number;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700158 spin_unlock(&drbd_pp_lock);
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200159 if (page)
160 return page;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700161 }
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200162
Philipp Reisnerb411b362009-09-25 16:07:19 -0700163 /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
164 * "criss-cross" setup, that might cause write-out on some other DRBD,
165 * which in turn might block on the other node at this very place. */
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200166 for (i = 0; i < number; i++) {
167 tmp = alloc_page(GFP_TRY);
168 if (!tmp)
169 break;
170 set_page_private(tmp, (unsigned long)page);
171 page = tmp;
172 }
173
174 if (i == number)
175 return page;
176
177 /* Not enough pages immediately available this time.
178 * No need to jump around here, drbd_pp_alloc will retry this
179 * function "soon". */
180 if (page) {
181 tmp = page_chain_tail(page, NULL);
182 spin_lock(&drbd_pp_lock);
183 page_chain_add(&drbd_pp_pool, page, tmp);
184 drbd_pp_vacant += i;
185 spin_unlock(&drbd_pp_lock);
186 }
187 return NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700188}
189
Philipp Reisnerb411b362009-09-25 16:07:19 -0700190static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
191{
192 struct drbd_epoch_entry *e;
193 struct list_head *le, *tle;
194
195 /* The EEs are always appended to the end of the list. Since
196 they are sent in order over the wire, they have to finish
197 in order. As soon as we see the first not finished we can
198 stop to examine the list... */
199
200 list_for_each_safe(le, tle, &mdev->net_ee) {
201 e = list_entry(le, struct drbd_epoch_entry, w.list);
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200202 if (drbd_ee_has_active_page(e))
Philipp Reisnerb411b362009-09-25 16:07:19 -0700203 break;
204 list_move(le, to_be_freed);
205 }
206}
207
208static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
209{
210 LIST_HEAD(reclaimed);
211 struct drbd_epoch_entry *e, *t;
212
Philipp Reisnerb411b362009-09-25 16:07:19 -0700213 spin_lock_irq(&mdev->req_lock);
214 reclaim_net_ee(mdev, &reclaimed);
215 spin_unlock_irq(&mdev->req_lock);
216
217 list_for_each_entry_safe(e, t, &reclaimed, w.list)
Lars Ellenberg435f0742010-09-06 12:30:25 +0200218 drbd_free_net_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700219}
220
221/**
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200222 * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700223 * @mdev: DRBD device.
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200224 * @number: number of pages requested
225 * @retry: whether to retry, if not enough pages are available right now
Philipp Reisnerb411b362009-09-25 16:07:19 -0700226 *
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200227 * Tries to allocate number pages, first from our own page pool, then from
228 * the kernel, unless this allocation would exceed the max_buffers setting.
229 * Possibly retry until DRBD frees sufficient pages somewhere else.
230 *
231 * Returns a page chain linked via page->private.
Philipp Reisnerb411b362009-09-25 16:07:19 -0700232 */
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200233static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700234{
235 struct page *page = NULL;
236 DEFINE_WAIT(wait);
237
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200238 /* Yes, we may run up to @number over max_buffers. If we
239 * follow it strictly, the admin will get it wrong anyways. */
240 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
241 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700242
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200243 while (page == NULL) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700244 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
245
246 drbd_kick_lo_and_reclaim_net(mdev);
247
248 if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200249 page = drbd_pp_first_pages_or_try_alloc(mdev, number);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700250 if (page)
251 break;
252 }
253
254 if (!retry)
255 break;
256
257 if (signal_pending(current)) {
258 dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
259 break;
260 }
261
262 schedule();
263 }
264 finish_wait(&drbd_pp_wait, &wait);
265
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200266 if (page)
267 atomic_add(number, &mdev->pp_in_use);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700268 return page;
269}
270
271/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200272 * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
273 * Either links the page chain back to the global pool,
274 * or returns all pages to the system. */
Lars Ellenberg435f0742010-09-06 12:30:25 +0200275static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700276{
Lars Ellenberg435f0742010-09-06 12:30:25 +0200277 atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700278 int i;
Lars Ellenberg435f0742010-09-06 12:30:25 +0200279
Lars Ellenberg1816a2b2010-11-11 15:19:07 +0100280 if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200281 i = page_chain_free(page);
282 else {
283 struct page *tmp;
284 tmp = page_chain_tail(page, &i);
285 spin_lock(&drbd_pp_lock);
286 page_chain_add(&drbd_pp_pool, page, tmp);
287 drbd_pp_vacant += i;
288 spin_unlock(&drbd_pp_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700289 }
Lars Ellenberg435f0742010-09-06 12:30:25 +0200290 i = atomic_sub_return(i, a);
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200291 if (i < 0)
Lars Ellenberg435f0742010-09-06 12:30:25 +0200292 dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
293 is_net ? "pp_in_use_by_net" : "pp_in_use", i);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700294 wake_up(&drbd_pp_wait);
295}
296
297/*
298You need to hold the req_lock:
299 _drbd_wait_ee_list_empty()
300
301You must not have the req_lock:
302 drbd_free_ee()
303 drbd_alloc_ee()
304 drbd_init_ee()
305 drbd_release_ee()
306 drbd_ee_fix_bhs()
307 drbd_process_done_ee()
308 drbd_clear_done_ee()
309 drbd_wait_ee_list_empty()
310*/
311
312struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
313 u64 id,
314 sector_t sector,
315 unsigned int data_size,
316 gfp_t gfp_mask) __must_hold(local)
317{
Philipp Reisnerb411b362009-09-25 16:07:19 -0700318 struct drbd_epoch_entry *e;
319 struct page *page;
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200320 unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700321
Andreas Gruenbacher0cf9d272010-12-07 10:43:29 +0100322 if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
Philipp Reisnerb411b362009-09-25 16:07:19 -0700323 return NULL;
324
325 e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
326 if (!e) {
327 if (!(gfp_mask & __GFP_NOWARN))
328 dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
329 return NULL;
330 }
331
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200332 page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
333 if (!page)
334 goto fail;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700335
Philipp Reisnerb411b362009-09-25 16:07:19 -0700336 INIT_HLIST_NODE(&e->colision);
337 e->epoch = NULL;
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200338 e->mdev = mdev;
339 e->pages = page;
340 atomic_set(&e->pending_bios, 0);
341 e->size = data_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700342 e->flags = 0;
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200343 e->sector = sector;
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200344 e->block_id = id;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700345
Philipp Reisnerb411b362009-09-25 16:07:19 -0700346 return e;
347
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200348 fail:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700349 mempool_free(e, drbd_ee_mempool);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700350 return NULL;
351}
352
Lars Ellenberg435f0742010-09-06 12:30:25 +0200353void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700354{
Lars Ellenbergc36c3ce2010-08-11 20:42:55 +0200355 if (e->flags & EE_HAS_DIGEST)
356 kfree(e->digest);
Lars Ellenberg435f0742010-09-06 12:30:25 +0200357 drbd_pp_free(mdev, e->pages, is_net);
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200358 D_ASSERT(atomic_read(&e->pending_bios) == 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700359 D_ASSERT(hlist_unhashed(&e->colision));
360 mempool_free(e, drbd_ee_mempool);
361}
362
363int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
364{
365 LIST_HEAD(work_list);
366 struct drbd_epoch_entry *e, *t;
367 int count = 0;
Lars Ellenberg435f0742010-09-06 12:30:25 +0200368 int is_net = list == &mdev->net_ee;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700369
370 spin_lock_irq(&mdev->req_lock);
371 list_splice_init(list, &work_list);
372 spin_unlock_irq(&mdev->req_lock);
373
374 list_for_each_entry_safe(e, t, &work_list, w.list) {
Lars Ellenberg435f0742010-09-06 12:30:25 +0200375 drbd_free_some_ee(mdev, e, is_net);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700376 count++;
377 }
378 return count;
379}
380
381
382/*
383 * This function is called from _asender only_
384 * but see also comments in _req_mod(,barrier_acked)
385 * and receive_Barrier.
386 *
387 * Move entries from net_ee to done_ee, if ready.
388 * Grab done_ee, call all callbacks, free the entries.
389 * The callbacks typically send out ACKs.
390 */
391static int drbd_process_done_ee(struct drbd_conf *mdev)
392{
393 LIST_HEAD(work_list);
394 LIST_HEAD(reclaimed);
395 struct drbd_epoch_entry *e, *t;
396 int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
397
398 spin_lock_irq(&mdev->req_lock);
399 reclaim_net_ee(mdev, &reclaimed);
400 list_splice_init(&mdev->done_ee, &work_list);
401 spin_unlock_irq(&mdev->req_lock);
402
403 list_for_each_entry_safe(e, t, &reclaimed, w.list)
Lars Ellenberg435f0742010-09-06 12:30:25 +0200404 drbd_free_net_ee(mdev, e);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700405
406 /* possible callbacks here:
407 * e_end_block, and e_end_resync_block, e_send_discard_ack.
408 * all ignore the last argument.
409 */
410 list_for_each_entry_safe(e, t, &work_list, w.list) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700411 /* list_del not necessary, next/prev members not touched */
412 ok = e->w.cb(mdev, &e->w, !ok) && ok;
413 drbd_free_ee(mdev, e);
414 }
415 wake_up(&mdev->ee_wait);
416
417 return ok;
418}
419
420void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
421{
422 DEFINE_WAIT(wait);
423
424 /* avoids spin_lock/unlock
425 * and calling prepare_to_wait in the fast path */
426 while (!list_empty(head)) {
427 prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
428 spin_unlock_irq(&mdev->req_lock);
Jens Axboe7eaceac2011-03-10 08:52:07 +0100429 io_schedule();
Philipp Reisnerb411b362009-09-25 16:07:19 -0700430 finish_wait(&mdev->ee_wait, &wait);
431 spin_lock_irq(&mdev->req_lock);
432 }
433}
434
435void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
436{
437 spin_lock_irq(&mdev->req_lock);
438 _drbd_wait_ee_list_empty(mdev, head);
439 spin_unlock_irq(&mdev->req_lock);
440}
441
442/* see also kernel_accept; which is only present since 2.6.18.
443 * also we want to log which part of it failed, exactly */
444static int drbd_accept(struct drbd_conf *mdev, const char **what,
445 struct socket *sock, struct socket **newsock)
446{
447 struct sock *sk = sock->sk;
448 int err = 0;
449
450 *what = "listen";
451 err = sock->ops->listen(sock, 5);
452 if (err < 0)
453 goto out;
454
455 *what = "sock_create_lite";
456 err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
457 newsock);
458 if (err < 0)
459 goto out;
460
461 *what = "accept";
462 err = sock->ops->accept(sock, *newsock, 0);
463 if (err < 0) {
464 sock_release(*newsock);
465 *newsock = NULL;
466 goto out;
467 }
468 (*newsock)->ops = sock->ops;
469
470out:
471 return err;
472}
473
474static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
475 void *buf, size_t size, int flags)
476{
477 mm_segment_t oldfs;
478 struct kvec iov = {
479 .iov_base = buf,
480 .iov_len = size,
481 };
482 struct msghdr msg = {
483 .msg_iovlen = 1,
484 .msg_iov = (struct iovec *)&iov,
485 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
486 };
487 int rv;
488
489 oldfs = get_fs();
490 set_fs(KERNEL_DS);
491 rv = sock_recvmsg(sock, &msg, size, msg.msg_flags);
492 set_fs(oldfs);
493
494 return rv;
495}
496
497static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
498{
499 mm_segment_t oldfs;
500 struct kvec iov = {
501 .iov_base = buf,
502 .iov_len = size,
503 };
504 struct msghdr msg = {
505 .msg_iovlen = 1,
506 .msg_iov = (struct iovec *)&iov,
507 .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
508 };
509 int rv;
510
511 oldfs = get_fs();
512 set_fs(KERNEL_DS);
513
514 for (;;) {
515 rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
516 if (rv == size)
517 break;
518
519 /* Note:
520 * ECONNRESET other side closed the connection
521 * ERESTARTSYS (on sock) we got a signal
522 */
523
524 if (rv < 0) {
525 if (rv == -ECONNRESET)
526 dev_info(DEV, "sock was reset by peer\n");
527 else if (rv != -ERESTARTSYS)
528 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
529 break;
530 } else if (rv == 0) {
531 dev_info(DEV, "sock was shut down by peer\n");
532 break;
533 } else {
534 /* signal came in, or peer/link went down,
535 * after we read a partial message
536 */
537 /* D_ASSERT(signal_pending(current)); */
538 break;
539 }
540 };
541
542 set_fs(oldfs);
543
544 if (rv != size)
545 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
546
547 return rv;
548}
549
Lars Ellenberg5dbf1672010-05-25 16:18:01 +0200550/* quoting tcp(7):
551 * On individual connections, the socket buffer size must be set prior to the
552 * listen(2) or connect(2) calls in order to have it take effect.
553 * This is our wrapper to do so.
554 */
555static void drbd_setbufsize(struct socket *sock, unsigned int snd,
556 unsigned int rcv)
557{
558 /* open coded SO_SNDBUF, SO_RCVBUF */
559 if (snd) {
560 sock->sk->sk_sndbuf = snd;
561 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
562 }
563 if (rcv) {
564 sock->sk->sk_rcvbuf = rcv;
565 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
566 }
567}
568
Philipp Reisnerb411b362009-09-25 16:07:19 -0700569static struct socket *drbd_try_connect(struct drbd_conf *mdev)
570{
571 const char *what;
572 struct socket *sock;
573 struct sockaddr_in6 src_in6;
574 int err;
575 int disconnect_on_error = 1;
576
577 if (!get_net_conf(mdev))
578 return NULL;
579
580 what = "sock_create_kern";
581 err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
582 SOCK_STREAM, IPPROTO_TCP, &sock);
583 if (err < 0) {
584 sock = NULL;
585 goto out;
586 }
587
588 sock->sk->sk_rcvtimeo =
589 sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
Lars Ellenberg5dbf1672010-05-25 16:18:01 +0200590 drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
591 mdev->net_conf->rcvbuf_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700592
593 /* explicitly bind to the configured IP as source IP
594 * for the outgoing connections.
595 * This is needed for multihomed hosts and to be
596 * able to use lo: interfaces for drbd.
597 * Make sure to use 0 as port number, so linux selects
598 * a free one dynamically.
599 */
600 memcpy(&src_in6, mdev->net_conf->my_addr,
601 min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
602 if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
603 src_in6.sin6_port = 0;
604 else
605 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
606
607 what = "bind before connect";
608 err = sock->ops->bind(sock,
609 (struct sockaddr *) &src_in6,
610 mdev->net_conf->my_addr_len);
611 if (err < 0)
612 goto out;
613
614 /* connect may fail, peer not yet available.
615 * stay C_WF_CONNECTION, don't go Disconnecting! */
616 disconnect_on_error = 0;
617 what = "connect";
618 err = sock->ops->connect(sock,
619 (struct sockaddr *)mdev->net_conf->peer_addr,
620 mdev->net_conf->peer_addr_len, 0);
621
622out:
623 if (err < 0) {
624 if (sock) {
625 sock_release(sock);
626 sock = NULL;
627 }
628 switch (-err) {
629 /* timeout, busy, signal pending */
630 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
631 case EINTR: case ERESTARTSYS:
632 /* peer not (yet) available, network problem */
633 case ECONNREFUSED: case ENETUNREACH:
634 case EHOSTDOWN: case EHOSTUNREACH:
635 disconnect_on_error = 0;
636 break;
637 default:
638 dev_err(DEV, "%s failed, err = %d\n", what, err);
639 }
640 if (disconnect_on_error)
641 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
642 }
643 put_net_conf(mdev);
644 return sock;
645}
646
647static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
648{
649 int timeo, err;
650 struct socket *s_estab = NULL, *s_listen;
651 const char *what;
652
653 if (!get_net_conf(mdev))
654 return NULL;
655
656 what = "sock_create_kern";
657 err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
658 SOCK_STREAM, IPPROTO_TCP, &s_listen);
659 if (err) {
660 s_listen = NULL;
661 goto out;
662 }
663
664 timeo = mdev->net_conf->try_connect_int * HZ;
665 timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
666
667 s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
668 s_listen->sk->sk_rcvtimeo = timeo;
669 s_listen->sk->sk_sndtimeo = timeo;
Lars Ellenberg5dbf1672010-05-25 16:18:01 +0200670 drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
671 mdev->net_conf->rcvbuf_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700672
673 what = "bind before listen";
674 err = s_listen->ops->bind(s_listen,
675 (struct sockaddr *) mdev->net_conf->my_addr,
676 mdev->net_conf->my_addr_len);
677 if (err < 0)
678 goto out;
679
680 err = drbd_accept(mdev, &what, s_listen, &s_estab);
681
682out:
683 if (s_listen)
684 sock_release(s_listen);
685 if (err < 0) {
686 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
687 dev_err(DEV, "%s failed, err = %d\n", what, err);
688 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
689 }
690 }
691 put_net_conf(mdev);
692
693 return s_estab;
694}
695
696static int drbd_send_fp(struct drbd_conf *mdev,
697 struct socket *sock, enum drbd_packets cmd)
698{
Philipp Reisner02918be2010-08-20 14:35:10 +0200699 struct p_header80 *h = &mdev->data.sbuf.header.h80;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700700
701 return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
702}
703
704static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
705{
Philipp Reisner02918be2010-08-20 14:35:10 +0200706 struct p_header80 *h = &mdev->data.rbuf.header.h80;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700707 int rr;
708
709 rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
710
711 if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
712 return be16_to_cpu(h->command);
713
714 return 0xffff;
715}
716
717/**
718 * drbd_socket_okay() - Free the socket if its connection is not okay
719 * @mdev: DRBD device.
720 * @sock: pointer to the pointer to the socket.
721 */
722static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
723{
724 int rr;
725 char tb[4];
726
727 if (!*sock)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100728 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700729
730 rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
731
732 if (rr > 0 || rr == -EAGAIN) {
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100733 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700734 } else {
735 sock_release(*sock);
736 *sock = NULL;
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100737 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700738 }
739}
740
741/*
742 * return values:
743 * 1 yes, we have a valid connection
744 * 0 oops, did not work out, please try again
745 * -1 peer talks different language,
746 * no point in trying again, please go standalone.
747 * -2 We do not have a network config...
748 */
749static int drbd_connect(struct drbd_conf *mdev)
750{
751 struct socket *s, *sock, *msock;
752 int try, h, ok;
753
754 D_ASSERT(!mdev->data.socket);
755
Philipp Reisnerb411b362009-09-25 16:07:19 -0700756 if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
757 return -2;
758
759 clear_bit(DISCARD_CONCURRENT, &mdev->flags);
760
761 sock = NULL;
762 msock = NULL;
763
764 do {
765 for (try = 0;;) {
766 /* 3 tries, this should take less than a second! */
767 s = drbd_try_connect(mdev);
768 if (s || ++try >= 3)
769 break;
770 /* give the other side time to call bind() & listen() */
Philipp Reisner20ee6392011-01-18 15:28:59 +0100771 schedule_timeout_interruptible(HZ / 10);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700772 }
773
774 if (s) {
775 if (!sock) {
776 drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
777 sock = s;
778 s = NULL;
779 } else if (!msock) {
780 drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
781 msock = s;
782 s = NULL;
783 } else {
784 dev_err(DEV, "Logic error in drbd_connect()\n");
785 goto out_release_sockets;
786 }
787 }
788
789 if (sock && msock) {
Philipp Reisner20ee6392011-01-18 15:28:59 +0100790 schedule_timeout_interruptible(HZ / 10);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700791 ok = drbd_socket_okay(mdev, &sock);
792 ok = drbd_socket_okay(mdev, &msock) && ok;
793 if (ok)
794 break;
795 }
796
797retry:
798 s = drbd_wait_for_connect(mdev);
799 if (s) {
800 try = drbd_recv_fp(mdev, s);
801 drbd_socket_okay(mdev, &sock);
802 drbd_socket_okay(mdev, &msock);
803 switch (try) {
804 case P_HAND_SHAKE_S:
805 if (sock) {
806 dev_warn(DEV, "initial packet S crossed\n");
807 sock_release(sock);
808 }
809 sock = s;
810 break;
811 case P_HAND_SHAKE_M:
812 if (msock) {
813 dev_warn(DEV, "initial packet M crossed\n");
814 sock_release(msock);
815 }
816 msock = s;
817 set_bit(DISCARD_CONCURRENT, &mdev->flags);
818 break;
819 default:
820 dev_warn(DEV, "Error receiving initial packet\n");
821 sock_release(s);
822 if (random32() & 1)
823 goto retry;
824 }
825 }
826
827 if (mdev->state.conn <= C_DISCONNECTING)
828 goto out_release_sockets;
829 if (signal_pending(current)) {
830 flush_signals(current);
831 smp_rmb();
832 if (get_t_state(&mdev->receiver) == Exiting)
833 goto out_release_sockets;
834 }
835
836 if (sock && msock) {
837 ok = drbd_socket_okay(mdev, &sock);
838 ok = drbd_socket_okay(mdev, &msock) && ok;
839 if (ok)
840 break;
841 }
842 } while (1);
843
844 msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
845 sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
846
847 sock->sk->sk_allocation = GFP_NOIO;
848 msock->sk->sk_allocation = GFP_NOIO;
849
850 sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
851 msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
852
Philipp Reisnerb411b362009-09-25 16:07:19 -0700853 /* NOT YET ...
854 * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
855 * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
856 * first set it to the P_HAND_SHAKE timeout,
857 * which we set to 4x the configured ping_timeout. */
858 sock->sk->sk_sndtimeo =
859 sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
860
861 msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
862 msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
863
864 /* we don't want delays.
865 * we use TCP_CORK where apropriate, though */
866 drbd_tcp_nodelay(sock);
867 drbd_tcp_nodelay(msock);
868
869 mdev->data.socket = sock;
870 mdev->meta.socket = msock;
871 mdev->last_received = jiffies;
872
873 D_ASSERT(mdev->asender.task == NULL);
874
875 h = drbd_do_handshake(mdev);
876 if (h <= 0)
877 return h;
878
879 if (mdev->cram_hmac_tfm) {
880 /* drbd_request_state(mdev, NS(conn, WFAuth)); */
Johannes Thomab10d96c2010-01-07 16:02:50 +0100881 switch (drbd_do_auth(mdev)) {
882 case -1:
Philipp Reisnerb411b362009-09-25 16:07:19 -0700883 dev_err(DEV, "Authentication of peer failed\n");
884 return -1;
Johannes Thomab10d96c2010-01-07 16:02:50 +0100885 case 0:
886 dev_err(DEV, "Authentication of peer failed, trying again.\n");
887 return 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700888 }
889 }
890
891 if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
892 return 0;
893
894 sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
895 sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
896
897 atomic_set(&mdev->packet_seq, 0);
898 mdev->peer_seq = 0;
899
900 drbd_thread_start(&mdev->asender);
901
Philipp Reisnerd5373382010-08-23 15:18:33 +0200902 if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
903 drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
904 put_ldev(mdev);
905 }
906
Philipp Reisner148efa12011-01-15 00:21:15 +0100907 if (drbd_send_protocol(mdev) == -1)
Philipp Reisner7e2455c2010-04-22 14:50:23 +0200908 return -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700909 drbd_send_sync_param(mdev, &mdev->sync_conf);
Philipp Reisnere89b5912010-03-24 17:11:33 +0100910 drbd_send_sizes(mdev, 0, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700911 drbd_send_uuids(mdev);
912 drbd_send_state(mdev);
913 clear_bit(USE_DEGR_WFC_T, &mdev->flags);
914 clear_bit(RESIZE_PENDING, &mdev->flags);
915
916 return 1;
917
918out_release_sockets:
919 if (sock)
920 sock_release(sock);
921 if (msock)
922 sock_release(msock);
923 return -1;
924}
925
Philipp Reisner02918be2010-08-20 14:35:10 +0200926static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700927{
Philipp Reisner02918be2010-08-20 14:35:10 +0200928 union p_header *h = &mdev->data.rbuf.header;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700929 int r;
930
931 r = drbd_recv(mdev, h, sizeof(*h));
Philipp Reisnerb411b362009-09-25 16:07:19 -0700932 if (unlikely(r != sizeof(*h))) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +0100933 if (!signal_pending(current))
934 dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100935 return false;
Philipp Reisner02918be2010-08-20 14:35:10 +0200936 }
937
938 if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
939 *cmd = be16_to_cpu(h->h80.command);
940 *packet_size = be16_to_cpu(h->h80.length);
941 } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
942 *cmd = be16_to_cpu(h->h95.command);
943 *packet_size = be32_to_cpu(h->h95.length);
944 } else {
Lars Ellenberg004352f2010-10-05 20:13:58 +0200945 dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
946 be32_to_cpu(h->h80.magic),
947 be16_to_cpu(h->h80.command),
948 be16_to_cpu(h->h80.length));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100949 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700950 }
951 mdev->last_received = jiffies;
952
Andreas Gruenbacher81e84652010-12-09 15:03:57 +0100953 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700954}
955
Philipp Reisner2451fc32010-08-24 13:43:11 +0200956static void drbd_flush(struct drbd_conf *mdev)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700957{
958 int rv;
959
960 if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
Dmitry Monakhovfbd9b092010-04-28 17:55:06 +0400961 rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
Christoph Hellwigdd3932e2010-09-16 20:51:46 +0200962 NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700963 if (rv) {
964 dev_err(DEV, "local disk flush failed with status %d\n", rv);
965 /* would rather check on EOPNOTSUPP, but that is not reliable.
966 * don't try again for ANY return value != 0
967 * if (rv == -EOPNOTSUPP) */
968 drbd_bump_write_ordering(mdev, WO_drain_io);
969 }
970 put_ldev(mdev);
971 }
Philipp Reisnerb411b362009-09-25 16:07:19 -0700972}
973
974/**
975 * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
976 * @mdev: DRBD device.
977 * @epoch: Epoch object.
978 * @ev: Epoch event.
979 */
980static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
981 struct drbd_epoch *epoch,
982 enum epoch_event ev)
983{
Philipp Reisner2451fc32010-08-24 13:43:11 +0200984 int epoch_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700985 struct drbd_epoch *next_epoch;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700986 enum finish_epoch rv = FE_STILL_LIVE;
987
988 spin_lock(&mdev->epoch_lock);
989 do {
990 next_epoch = NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700991
992 epoch_size = atomic_read(&epoch->epoch_size);
993
994 switch (ev & ~EV_CLEANUP) {
995 case EV_PUT:
996 atomic_dec(&epoch->active);
997 break;
998 case EV_GOT_BARRIER_NR:
999 set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001000 break;
1001 case EV_BECAME_LAST:
1002 /* nothing to do*/
1003 break;
1004 }
1005
Philipp Reisnerb411b362009-09-25 16:07:19 -07001006 if (epoch_size != 0 &&
1007 atomic_read(&epoch->active) == 0 &&
Philipp Reisner2451fc32010-08-24 13:43:11 +02001008 test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001009 if (!(ev & EV_CLEANUP)) {
1010 spin_unlock(&mdev->epoch_lock);
1011 drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
1012 spin_lock(&mdev->epoch_lock);
1013 }
1014 dec_unacked(mdev);
1015
1016 if (mdev->current_epoch != epoch) {
1017 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1018 list_del(&epoch->list);
1019 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1020 mdev->epochs--;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001021 kfree(epoch);
1022
1023 if (rv == FE_STILL_LIVE)
1024 rv = FE_DESTROYED;
1025 } else {
1026 epoch->flags = 0;
1027 atomic_set(&epoch->epoch_size, 0);
Uwe Kleine-König698f9312010-07-02 20:41:51 +02001028 /* atomic_set(&epoch->active, 0); is already zero */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001029 if (rv == FE_STILL_LIVE)
1030 rv = FE_RECYCLED;
Philipp Reisner2451fc32010-08-24 13:43:11 +02001031 wake_up(&mdev->ee_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001032 }
1033 }
1034
1035 if (!next_epoch)
1036 break;
1037
1038 epoch = next_epoch;
1039 } while (1);
1040
1041 spin_unlock(&mdev->epoch_lock);
1042
Philipp Reisnerb411b362009-09-25 16:07:19 -07001043 return rv;
1044}
1045
1046/**
1047 * drbd_bump_write_ordering() - Fall back to an other write ordering method
1048 * @mdev: DRBD device.
1049 * @wo: Write ordering method to try.
1050 */
1051void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
1052{
1053 enum write_ordering_e pwo;
1054 static char *write_ordering_str[] = {
1055 [WO_none] = "none",
1056 [WO_drain_io] = "drain",
1057 [WO_bdev_flush] = "flush",
Philipp Reisnerb411b362009-09-25 16:07:19 -07001058 };
1059
1060 pwo = mdev->write_ordering;
1061 wo = min(pwo, wo);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001062 if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
1063 wo = WO_drain_io;
1064 if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
1065 wo = WO_none;
1066 mdev->write_ordering = wo;
Philipp Reisner2451fc32010-08-24 13:43:11 +02001067 if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001068 dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
1069}
1070
1071/**
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001072 * drbd_submit_ee()
1073 * @mdev: DRBD device.
1074 * @e: epoch entry
1075 * @rw: flag field, see bio->bi_rw
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001076 *
1077 * May spread the pages to multiple bios,
1078 * depending on bio_add_page restrictions.
1079 *
1080 * Returns 0 if all bios have been submitted,
1081 * -ENOMEM if we could not allocate enough bios,
1082 * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1083 * single page to an empty bio (which should never happen and likely indicates
1084 * that the lower level IO stack is in some way broken). This has been observed
1085 * on certain Xen deployments.
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001086 */
1087/* TODO allocate from our own bio_set. */
1088int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
1089 const unsigned rw, const int fault_type)
1090{
1091 struct bio *bios = NULL;
1092 struct bio *bio;
1093 struct page *page = e->pages;
1094 sector_t sector = e->sector;
1095 unsigned ds = e->size;
1096 unsigned n_bios = 0;
1097 unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001098 int err = -ENOMEM;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001099
1100 /* In most cases, we will only need one bio. But in case the lower
1101 * level restrictions happen to be different at this offset on this
1102 * side than those of the sending peer, we may need to submit the
1103 * request in more than one bio. */
1104next_bio:
1105 bio = bio_alloc(GFP_NOIO, nr_pages);
1106 if (!bio) {
1107 dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
1108 goto fail;
1109 }
1110 /* > e->sector, unless this is the first bio */
1111 bio->bi_sector = sector;
1112 bio->bi_bdev = mdev->ldev->backing_bdev;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001113 bio->bi_rw = rw;
1114 bio->bi_private = e;
1115 bio->bi_end_io = drbd_endio_sec;
1116
1117 bio->bi_next = bios;
1118 bios = bio;
1119 ++n_bios;
1120
1121 page_chain_for_each(page) {
1122 unsigned len = min_t(unsigned, ds, PAGE_SIZE);
1123 if (!bio_add_page(bio, page, len, 0)) {
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001124 /* A single page must always be possible!
1125 * But in case it fails anyways,
1126 * we deal with it, and complain (below). */
1127 if (bio->bi_vcnt == 0) {
1128 dev_err(DEV,
1129 "bio_add_page failed for len=%u, "
1130 "bi_vcnt=0 (bi_sector=%llu)\n",
1131 len, (unsigned long long)bio->bi_sector);
1132 err = -ENOSPC;
1133 goto fail;
1134 }
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001135 goto next_bio;
1136 }
1137 ds -= len;
1138 sector += len >> 9;
1139 --nr_pages;
1140 }
1141 D_ASSERT(page == NULL);
1142 D_ASSERT(ds == 0);
1143
1144 atomic_set(&e->pending_bios, n_bios);
1145 do {
1146 bio = bios;
1147 bios = bios->bi_next;
1148 bio->bi_next = NULL;
1149
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001150 drbd_generic_make_request(mdev, fault_type, bio);
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001151 } while (bios);
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001152 return 0;
1153
1154fail:
1155 while (bios) {
1156 bio = bios;
1157 bios = bios->bi_next;
1158 bio_put(bio);
1159 }
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001160 return err;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001161}
1162
Philipp Reisner02918be2010-08-20 14:35:10 +02001163static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001164{
Philipp Reisner2451fc32010-08-24 13:43:11 +02001165 int rv;
Philipp Reisner02918be2010-08-20 14:35:10 +02001166 struct p_barrier *p = &mdev->data.rbuf.barrier;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001167 struct drbd_epoch *epoch;
1168
Philipp Reisnerb411b362009-09-25 16:07:19 -07001169 inc_unacked(mdev);
1170
Philipp Reisnerb411b362009-09-25 16:07:19 -07001171 mdev->current_epoch->barrier_nr = p->barrier;
1172 rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
1173
1174 /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1175 * the activity log, which means it would not be resynced in case the
1176 * R_PRIMARY crashes now.
1177 * Therefore we must send the barrier_ack after the barrier request was
1178 * completed. */
1179 switch (mdev->write_ordering) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001180 case WO_none:
1181 if (rv == FE_RECYCLED)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001182 return true;
Philipp Reisner2451fc32010-08-24 13:43:11 +02001183
1184 /* receiver context, in the writeout path of the other node.
1185 * avoid potential distributed deadlock */
1186 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1187 if (epoch)
1188 break;
1189 else
1190 dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
1191 /* Fall through */
Philipp Reisnerb411b362009-09-25 16:07:19 -07001192
1193 case WO_bdev_flush:
1194 case WO_drain_io:
Philipp Reisnerb411b362009-09-25 16:07:19 -07001195 drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
Philipp Reisner2451fc32010-08-24 13:43:11 +02001196 drbd_flush(mdev);
1197
1198 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1199 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1200 if (epoch)
1201 break;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001202 }
1203
Philipp Reisner2451fc32010-08-24 13:43:11 +02001204 epoch = mdev->current_epoch;
1205 wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
1206
1207 D_ASSERT(atomic_read(&epoch->active) == 0);
1208 D_ASSERT(epoch->flags == 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001209
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001210 return true;
Philipp Reisner2451fc32010-08-24 13:43:11 +02001211 default:
1212 dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001213 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001214 }
1215
1216 epoch->flags = 0;
1217 atomic_set(&epoch->epoch_size, 0);
1218 atomic_set(&epoch->active, 0);
1219
1220 spin_lock(&mdev->epoch_lock);
1221 if (atomic_read(&mdev->current_epoch->epoch_size)) {
1222 list_add(&epoch->list, &mdev->current_epoch->list);
1223 mdev->current_epoch = epoch;
1224 mdev->epochs++;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001225 } else {
1226 /* The current_epoch got recycled while we allocated this one... */
1227 kfree(epoch);
1228 }
1229 spin_unlock(&mdev->epoch_lock);
1230
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001231 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001232}
1233
1234/* used from receive_RSDataReply (recv_resync_read)
1235 * and from receive_Data */
1236static struct drbd_epoch_entry *
1237read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
1238{
Lars Ellenberg66660322010-04-06 12:15:04 +02001239 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001240 struct drbd_epoch_entry *e;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001241 struct page *page;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001242 int dgs, ds, rr;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001243 void *dig_in = mdev->int_dig_in;
1244 void *dig_vv = mdev->int_dig_vv;
Philipp Reisner6b4388a2010-04-26 14:11:45 +02001245 unsigned long *data;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001246
1247 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1248 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1249
1250 if (dgs) {
1251 rr = drbd_recv(mdev, dig_in, dgs);
1252 if (rr != dgs) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001253 if (!signal_pending(current))
1254 dev_warn(DEV,
1255 "short read receiving data digest: read %d expected %d\n",
1256 rr, dgs);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001257 return NULL;
1258 }
1259 }
1260
1261 data_size -= dgs;
1262
Philipp Reisnerd07c9c12011-01-20 16:49:33 +01001263 ERR_IF(data_size == 0) return NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001264 ERR_IF(data_size & 0x1ff) return NULL;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01001265 ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001266
Lars Ellenberg66660322010-04-06 12:15:04 +02001267 /* even though we trust out peer,
1268 * we sometimes have to double check. */
1269 if (sector + (data_size>>9) > capacity) {
1270 dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
1271 (unsigned long long)capacity,
1272 (unsigned long long)sector, data_size);
1273 return NULL;
1274 }
1275
Philipp Reisnerb411b362009-09-25 16:07:19 -07001276 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1277 * "criss-cross" setup, that might cause write-out on some other DRBD,
1278 * which in turn might block on the other node at this very place. */
1279 e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
1280 if (!e)
1281 return NULL;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001282
Philipp Reisnerb411b362009-09-25 16:07:19 -07001283 ds = data_size;
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001284 page = e->pages;
1285 page_chain_for_each(page) {
1286 unsigned len = min_t(int, ds, PAGE_SIZE);
Philipp Reisner6b4388a2010-04-26 14:11:45 +02001287 data = kmap(page);
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001288 rr = drbd_recv(mdev, data, len);
Andreas Gruenbacher0cf9d272010-12-07 10:43:29 +01001289 if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
Philipp Reisner6b4388a2010-04-26 14:11:45 +02001290 dev_err(DEV, "Fault injection: Corrupting data on receive\n");
1291 data[0] = data[0] ^ (unsigned long)-1;
1292 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07001293 kunmap(page);
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001294 if (rr != len) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001295 drbd_free_ee(mdev, e);
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001296 if (!signal_pending(current))
1297 dev_warn(DEV, "short read receiving data: read %d expected %d\n",
1298 rr, len);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001299 return NULL;
1300 }
1301 ds -= rr;
1302 }
1303
1304 if (dgs) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001305 drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001306 if (memcmp(dig_in, dig_vv, dgs)) {
Lars Ellenberg470be442010-11-10 10:36:52 +01001307 dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
1308 (unsigned long long)sector, data_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001309 drbd_bcast_ee(mdev, "digest failed",
1310 dgs, dig_in, dig_vv, e);
1311 drbd_free_ee(mdev, e);
1312 return NULL;
1313 }
1314 }
1315 mdev->recv_cnt += data_size>>9;
1316 return e;
1317}
1318
1319/* drbd_drain_block() just takes a data block
1320 * out of the socket input buffer, and discards it.
1321 */
1322static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
1323{
1324 struct page *page;
1325 int rr, rv = 1;
1326 void *data;
1327
Lars Ellenbergc3470cd2010-04-01 16:57:19 +02001328 if (!data_size)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001329 return true;
Lars Ellenbergc3470cd2010-04-01 16:57:19 +02001330
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001331 page = drbd_pp_alloc(mdev, 1, 1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001332
1333 data = kmap(page);
1334 while (data_size) {
1335 rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
1336 if (rr != min_t(int, data_size, PAGE_SIZE)) {
1337 rv = 0;
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001338 if (!signal_pending(current))
1339 dev_warn(DEV,
1340 "short read receiving data: read %d expected %d\n",
1341 rr, min_t(int, data_size, PAGE_SIZE));
Philipp Reisnerb411b362009-09-25 16:07:19 -07001342 break;
1343 }
1344 data_size -= rr;
1345 }
1346 kunmap(page);
Lars Ellenberg435f0742010-09-06 12:30:25 +02001347 drbd_pp_free(mdev, page, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001348 return rv;
1349}
1350
1351static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
1352 sector_t sector, int data_size)
1353{
1354 struct bio_vec *bvec;
1355 struct bio *bio;
1356 int dgs, rr, i, expect;
1357 void *dig_in = mdev->int_dig_in;
1358 void *dig_vv = mdev->int_dig_vv;
1359
1360 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
1361 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
1362
1363 if (dgs) {
1364 rr = drbd_recv(mdev, dig_in, dgs);
1365 if (rr != dgs) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001366 if (!signal_pending(current))
1367 dev_warn(DEV,
1368 "short read receiving data reply digest: read %d expected %d\n",
1369 rr, dgs);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001370 return 0;
1371 }
1372 }
1373
1374 data_size -= dgs;
1375
1376 /* optimistically update recv_cnt. if receiving fails below,
1377 * we disconnect anyways, and counters will be reset. */
1378 mdev->recv_cnt += data_size>>9;
1379
1380 bio = req->master_bio;
1381 D_ASSERT(sector == bio->bi_sector);
1382
1383 bio_for_each_segment(bvec, bio, i) {
1384 expect = min_t(int, data_size, bvec->bv_len);
1385 rr = drbd_recv(mdev,
1386 kmap(bvec->bv_page)+bvec->bv_offset,
1387 expect);
1388 kunmap(bvec->bv_page);
1389 if (rr != expect) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01001390 if (!signal_pending(current))
1391 dev_warn(DEV, "short read receiving data reply: "
1392 "read %d expected %d\n",
1393 rr, expect);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001394 return 0;
1395 }
1396 data_size -= rr;
1397 }
1398
1399 if (dgs) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001400 drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001401 if (memcmp(dig_in, dig_vv, dgs)) {
1402 dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
1403 return 0;
1404 }
1405 }
1406
1407 D_ASSERT(data_size == 0);
1408 return 1;
1409}
1410
1411/* e_end_resync_block() is called via
1412 * drbd_process_done_ee() by asender only */
1413static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1414{
1415 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1416 sector_t sector = e->sector;
1417 int ok;
1418
1419 D_ASSERT(hlist_unhashed(&e->colision));
1420
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001421 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001422 drbd_set_in_sync(mdev, sector, e->size);
1423 ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
1424 } else {
1425 /* Record failure to sync */
1426 drbd_rs_failed_io(mdev, sector, e->size);
1427
1428 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1429 }
1430 dec_unacked(mdev);
1431
1432 return ok;
1433}
1434
1435static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
1436{
1437 struct drbd_epoch_entry *e;
1438
1439 e = read_in_block(mdev, ID_SYNCER, sector, data_size);
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001440 if (!e)
1441 goto fail;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001442
1443 dec_rs_pending(mdev);
1444
Philipp Reisnerb411b362009-09-25 16:07:19 -07001445 inc_unacked(mdev);
1446 /* corresponding dec_unacked() in e_end_resync_block()
1447 * respective _drbd_clear_done_ee */
1448
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001449 e->w.cb = e_end_resync_block;
1450
Philipp Reisnerb411b362009-09-25 16:07:19 -07001451 spin_lock_irq(&mdev->req_lock);
1452 list_add(&e->w.list, &mdev->sync_ee);
1453 spin_unlock_irq(&mdev->req_lock);
1454
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001455 atomic_add(data_size >> 9, &mdev->rs_sect_ev);
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001456 if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001457 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001458
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001459 /* don't care for the reason here */
1460 dev_err(DEV, "submit failed, triggering re-connect\n");
Lars Ellenberg22cc37a2010-09-14 20:40:41 +02001461 spin_lock_irq(&mdev->req_lock);
1462 list_del(&e->w.list);
1463 spin_unlock_irq(&mdev->req_lock);
1464
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001465 drbd_free_ee(mdev, e);
1466fail:
1467 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001468 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001469}
1470
Philipp Reisner02918be2010-08-20 14:35:10 +02001471static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001472{
1473 struct drbd_request *req;
1474 sector_t sector;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001475 int ok;
Philipp Reisner02918be2010-08-20 14:35:10 +02001476 struct p_data *p = &mdev->data.rbuf.data;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001477
1478 sector = be64_to_cpu(p->sector);
1479
1480 spin_lock_irq(&mdev->req_lock);
1481 req = _ar_id_to_req(mdev, p->block_id, sector);
1482 spin_unlock_irq(&mdev->req_lock);
1483 if (unlikely(!req)) {
1484 dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001485 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001486 }
1487
1488 /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid
1489 * special casing it there for the various failure cases.
1490 * still no race with drbd_fail_pending_reads */
1491 ok = recv_dless_read(mdev, req, sector, data_size);
1492
1493 if (ok)
1494 req_mod(req, data_received);
1495 /* else: nothing. handled from drbd_disconnect...
1496 * I don't think we may complete this just yet
1497 * in case we are "on-disconnect: freeze" */
1498
1499 return ok;
1500}
1501
Philipp Reisner02918be2010-08-20 14:35:10 +02001502static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001503{
1504 sector_t sector;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001505 int ok;
Philipp Reisner02918be2010-08-20 14:35:10 +02001506 struct p_data *p = &mdev->data.rbuf.data;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001507
1508 sector = be64_to_cpu(p->sector);
1509 D_ASSERT(p->block_id == ID_SYNCER);
1510
1511 if (get_ldev(mdev)) {
1512 /* data is submitted to disk within recv_resync_read.
1513 * corresponding put_ldev done below on error,
1514 * or in drbd_endio_write_sec. */
1515 ok = recv_resync_read(mdev, sector, data_size);
1516 } else {
1517 if (__ratelimit(&drbd_ratelimit_state))
1518 dev_err(DEV, "Can not write resync data to local disk.\n");
1519
1520 ok = drbd_drain_block(mdev, data_size);
1521
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02001522 drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001523 }
1524
Philipp Reisner778f2712010-07-06 11:14:00 +02001525 atomic_add(data_size >> 9, &mdev->rs_sect_in);
1526
Philipp Reisnerb411b362009-09-25 16:07:19 -07001527 return ok;
1528}
1529
1530/* e_end_block() is called via drbd_process_done_ee().
1531 * this means this function only runs in the asender thread
1532 */
1533static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1534{
1535 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1536 sector_t sector = e->sector;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001537 int ok = 1, pcmd;
1538
Philipp Reisnerb411b362009-09-25 16:07:19 -07001539 if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001540 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001541 pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
1542 mdev->state.conn <= C_PAUSED_SYNC_T &&
1543 e->flags & EE_MAY_SET_IN_SYNC) ?
1544 P_RS_WRITE_ACK : P_WRITE_ACK;
1545 ok &= drbd_send_ack(mdev, pcmd, e);
1546 if (pcmd == P_RS_WRITE_ACK)
1547 drbd_set_in_sync(mdev, sector, e->size);
1548 } else {
1549 ok = drbd_send_ack(mdev, P_NEG_ACK, e);
1550 /* we expect it to be marked out of sync anyways...
1551 * maybe assert this? */
1552 }
1553 dec_unacked(mdev);
1554 }
1555 /* we delete from the conflict detection hash _after_ we sent out the
1556 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
1557 if (mdev->net_conf->two_primaries) {
1558 spin_lock_irq(&mdev->req_lock);
1559 D_ASSERT(!hlist_unhashed(&e->colision));
1560 hlist_del_init(&e->colision);
1561 spin_unlock_irq(&mdev->req_lock);
1562 } else {
1563 D_ASSERT(hlist_unhashed(&e->colision));
1564 }
1565
1566 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
1567
1568 return ok;
1569}
1570
1571static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1572{
1573 struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
1574 int ok = 1;
1575
1576 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1577 ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
1578
1579 spin_lock_irq(&mdev->req_lock);
1580 D_ASSERT(!hlist_unhashed(&e->colision));
1581 hlist_del_init(&e->colision);
1582 spin_unlock_irq(&mdev->req_lock);
1583
1584 dec_unacked(mdev);
1585
1586 return ok;
1587}
1588
1589/* Called from receive_Data.
1590 * Synchronize packets on sock with packets on msock.
1591 *
1592 * This is here so even when a P_DATA packet traveling via sock overtook an Ack
1593 * packet traveling on msock, they are still processed in the order they have
1594 * been sent.
1595 *
1596 * Note: we don't care for Ack packets overtaking P_DATA packets.
1597 *
1598 * In case packet_seq is larger than mdev->peer_seq number, there are
1599 * outstanding packets on the msock. We wait for them to arrive.
1600 * In case we are the logically next packet, we update mdev->peer_seq
1601 * ourselves. Correctly handles 32bit wrap around.
1602 *
1603 * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
1604 * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
1605 * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
1606 * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
1607 *
1608 * returns 0 if we may process the packet,
1609 * -ERESTARTSYS if we were interrupted (by disconnect signal). */
1610static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
1611{
1612 DEFINE_WAIT(wait);
1613 unsigned int p_seq;
1614 long timeout;
1615 int ret = 0;
1616 spin_lock(&mdev->peer_seq_lock);
1617 for (;;) {
1618 prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
1619 if (seq_le(packet_seq, mdev->peer_seq+1))
1620 break;
1621 if (signal_pending(current)) {
1622 ret = -ERESTARTSYS;
1623 break;
1624 }
1625 p_seq = mdev->peer_seq;
1626 spin_unlock(&mdev->peer_seq_lock);
1627 timeout = schedule_timeout(30*HZ);
1628 spin_lock(&mdev->peer_seq_lock);
1629 if (timeout == 0 && p_seq == mdev->peer_seq) {
1630 ret = -ETIMEDOUT;
1631 dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
1632 break;
1633 }
1634 }
1635 finish_wait(&mdev->seq_wait, &wait);
1636 if (mdev->peer_seq+1 == packet_seq)
1637 mdev->peer_seq++;
1638 spin_unlock(&mdev->peer_seq_lock);
1639 return ret;
1640}
1641
Lars Ellenberg688593c2010-11-17 22:25:03 +01001642/* see also bio_flags_to_wire()
1643 * DRBD_REQ_*, because we need to semantically map the flags to data packet
1644 * flags and back. We may replicate to other kernel versions. */
1645static unsigned long wire_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02001646{
Lars Ellenberg688593c2010-11-17 22:25:03 +01001647 return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
1648 (dpf & DP_FUA ? REQ_FUA : 0) |
1649 (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
1650 (dpf & DP_DISCARD ? REQ_DISCARD : 0);
Philipp Reisner76d2e7e2010-08-25 11:58:05 +02001651}
1652
Philipp Reisnerb411b362009-09-25 16:07:19 -07001653/* mirrored write */
Philipp Reisner02918be2010-08-20 14:35:10 +02001654static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001655{
1656 sector_t sector;
1657 struct drbd_epoch_entry *e;
Philipp Reisner02918be2010-08-20 14:35:10 +02001658 struct p_data *p = &mdev->data.rbuf.data;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001659 int rw = WRITE;
1660 u32 dp_flags;
1661
Philipp Reisnerb411b362009-09-25 16:07:19 -07001662 if (!get_ldev(mdev)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001663 spin_lock(&mdev->peer_seq_lock);
1664 if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
1665 mdev->peer_seq++;
1666 spin_unlock(&mdev->peer_seq_lock);
1667
Lars Ellenberg2b2bf212010-10-06 11:46:55 +02001668 drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001669 atomic_inc(&mdev->current_epoch->epoch_size);
1670 return drbd_drain_block(mdev, data_size);
1671 }
1672
1673 /* get_ldev(mdev) successful.
1674 * Corresponding put_ldev done either below (on various errors),
1675 * or in drbd_endio_write_sec, if we successfully submit the data at
1676 * the end of this function. */
1677
1678 sector = be64_to_cpu(p->sector);
1679 e = read_in_block(mdev, p->block_id, sector, data_size);
1680 if (!e) {
1681 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001682 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001683 }
1684
Philipp Reisnerb411b362009-09-25 16:07:19 -07001685 e->w.cb = e_end_block;
1686
Lars Ellenberg688593c2010-11-17 22:25:03 +01001687 dp_flags = be32_to_cpu(p->dp_flags);
1688 rw |= wire_flags_to_bio(mdev, dp_flags);
1689
1690 if (dp_flags & DP_MAY_SET_IN_SYNC)
1691 e->flags |= EE_MAY_SET_IN_SYNC;
1692
Philipp Reisnerb411b362009-09-25 16:07:19 -07001693 spin_lock(&mdev->epoch_lock);
1694 e->epoch = mdev->current_epoch;
1695 atomic_inc(&e->epoch->epoch_size);
1696 atomic_inc(&e->epoch->active);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001697 spin_unlock(&mdev->epoch_lock);
1698
Philipp Reisnerb411b362009-09-25 16:07:19 -07001699 /* I'm the receiver, I do hold a net_cnt reference. */
1700 if (!mdev->net_conf->two_primaries) {
1701 spin_lock_irq(&mdev->req_lock);
1702 } else {
1703 /* don't get the req_lock yet,
1704 * we may sleep in drbd_wait_peer_seq */
1705 const int size = e->size;
1706 const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags);
1707 DEFINE_WAIT(wait);
1708 struct drbd_request *i;
1709 struct hlist_node *n;
1710 struct hlist_head *slot;
1711 int first;
1712
1713 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
1714 BUG_ON(mdev->ee_hash == NULL);
1715 BUG_ON(mdev->tl_hash == NULL);
1716
1717 /* conflict detection and handling:
1718 * 1. wait on the sequence number,
1719 * in case this data packet overtook ACK packets.
1720 * 2. check our hash tables for conflicting requests.
1721 * we only need to walk the tl_hash, since an ee can not
1722 * have a conflict with an other ee: on the submitting
1723 * node, the corresponding req had already been conflicting,
1724 * and a conflicting req is never sent.
1725 *
1726 * Note: for two_primaries, we are protocol C,
1727 * so there cannot be any request that is DONE
1728 * but still on the transfer log.
1729 *
1730 * unconditionally add to the ee_hash.
1731 *
1732 * if no conflicting request is found:
1733 * submit.
1734 *
1735 * if any conflicting request is found
1736 * that has not yet been acked,
1737 * AND I have the "discard concurrent writes" flag:
1738 * queue (via done_ee) the P_DISCARD_ACK; OUT.
1739 *
1740 * if any conflicting request is found:
1741 * block the receiver, waiting on misc_wait
1742 * until no more conflicting requests are there,
1743 * or we get interrupted (disconnect).
1744 *
1745 * we do not just write after local io completion of those
1746 * requests, but only after req is done completely, i.e.
1747 * we wait for the P_DISCARD_ACK to arrive!
1748 *
1749 * then proceed normally, i.e. submit.
1750 */
1751 if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
1752 goto out_interrupted;
1753
1754 spin_lock_irq(&mdev->req_lock);
1755
1756 hlist_add_head(&e->colision, ee_hash_slot(mdev, sector));
1757
1758#define OVERLAPS overlaps(i->sector, i->size, sector, size)
1759 slot = tl_hash_slot(mdev, sector);
1760 first = 1;
1761 for (;;) {
1762 int have_unacked = 0;
1763 int have_conflict = 0;
1764 prepare_to_wait(&mdev->misc_wait, &wait,
1765 TASK_INTERRUPTIBLE);
1766 hlist_for_each_entry(i, n, slot, colision) {
1767 if (OVERLAPS) {
1768 /* only ALERT on first iteration,
1769 * we may be woken up early... */
1770 if (first)
1771 dev_alert(DEV, "%s[%u] Concurrent local write detected!"
1772 " new: %llus +%u; pending: %llus +%u\n",
1773 current->comm, current->pid,
1774 (unsigned long long)sector, size,
1775 (unsigned long long)i->sector, i->size);
1776 if (i->rq_state & RQ_NET_PENDING)
1777 ++have_unacked;
1778 ++have_conflict;
1779 }
1780 }
1781#undef OVERLAPS
1782 if (!have_conflict)
1783 break;
1784
1785 /* Discard Ack only for the _first_ iteration */
1786 if (first && discard && have_unacked) {
1787 dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
1788 (unsigned long long)sector);
1789 inc_unacked(mdev);
1790 e->w.cb = e_send_discard_ack;
1791 list_add_tail(&e->w.list, &mdev->done_ee);
1792
1793 spin_unlock_irq(&mdev->req_lock);
1794
1795 /* we could probably send that P_DISCARD_ACK ourselves,
1796 * but I don't like the receiver using the msock */
1797
1798 put_ldev(mdev);
1799 wake_asender(mdev);
1800 finish_wait(&mdev->misc_wait, &wait);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001801 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001802 }
1803
1804 if (signal_pending(current)) {
1805 hlist_del_init(&e->colision);
1806
1807 spin_unlock_irq(&mdev->req_lock);
1808
1809 finish_wait(&mdev->misc_wait, &wait);
1810 goto out_interrupted;
1811 }
1812
1813 spin_unlock_irq(&mdev->req_lock);
1814 if (first) {
1815 first = 0;
1816 dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
1817 "sec=%llus\n", (unsigned long long)sector);
1818 } else if (discard) {
1819 /* we had none on the first iteration.
1820 * there must be none now. */
1821 D_ASSERT(have_unacked == 0);
1822 }
1823 schedule();
1824 spin_lock_irq(&mdev->req_lock);
1825 }
1826 finish_wait(&mdev->misc_wait, &wait);
1827 }
1828
1829 list_add(&e->w.list, &mdev->active_ee);
1830 spin_unlock_irq(&mdev->req_lock);
1831
1832 switch (mdev->net_conf->wire_protocol) {
1833 case DRBD_PROT_C:
1834 inc_unacked(mdev);
1835 /* corresponding dec_unacked() in e_end_block()
1836 * respective _drbd_clear_done_ee */
1837 break;
1838 case DRBD_PROT_B:
1839 /* I really don't like it that the receiver thread
1840 * sends on the msock, but anyways */
1841 drbd_send_ack(mdev, P_RECV_ACK, e);
1842 break;
1843 case DRBD_PROT_A:
1844 /* nothing to do */
1845 break;
1846 }
1847
Lars Ellenberg6719fb02010-10-18 23:04:07 +02001848 if (mdev->state.pdsk < D_INCONSISTENT) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001849 /* In case we have the only disk of the cluster, */
1850 drbd_set_out_of_sync(mdev, e->sector, e->size);
1851 e->flags |= EE_CALL_AL_COMPLETE_IO;
Lars Ellenberg6719fb02010-10-18 23:04:07 +02001852 e->flags &= ~EE_MAY_SET_IN_SYNC;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001853 drbd_al_begin_io(mdev, e->sector);
1854 }
1855
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001856 if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001857 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001858
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001859 /* don't care for the reason here */
1860 dev_err(DEV, "submit failed, triggering re-connect\n");
Lars Ellenberg22cc37a2010-09-14 20:40:41 +02001861 spin_lock_irq(&mdev->req_lock);
1862 list_del(&e->w.list);
1863 hlist_del_init(&e->colision);
1864 spin_unlock_irq(&mdev->req_lock);
1865 if (e->flags & EE_CALL_AL_COMPLETE_IO)
1866 drbd_al_complete_io(mdev, e->sector);
1867
Philipp Reisnerb411b362009-09-25 16:07:19 -07001868out_interrupted:
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01001869 drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001870 put_ldev(mdev);
1871 drbd_free_ee(mdev, e);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001872 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001873}
1874
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001875/* We may throttle resync, if the lower device seems to be busy,
1876 * and current sync rate is above c_min_rate.
1877 *
1878 * To decide whether or not the lower device is busy, we use a scheme similar
1879 * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
1880 * (more than 64 sectors) of activity we cannot account for with our own resync
1881 * activity, it obviously is "busy".
1882 *
1883 * The current sync rate used here uses only the most recent two step marks,
1884 * to have a short time average so we can react faster.
1885 */
Philipp Reisnere3555d82010-11-07 15:56:29 +01001886int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001887{
1888 struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
1889 unsigned long db, dt, dbdt;
Philipp Reisnere3555d82010-11-07 15:56:29 +01001890 struct lc_element *tmp;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001891 int curr_events;
1892 int throttle = 0;
1893
1894 /* feature disabled? */
1895 if (mdev->sync_conf.c_min_rate == 0)
1896 return 0;
1897
Philipp Reisnere3555d82010-11-07 15:56:29 +01001898 spin_lock_irq(&mdev->al_lock);
1899 tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
1900 if (tmp) {
1901 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
1902 if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
1903 spin_unlock_irq(&mdev->al_lock);
1904 return 0;
1905 }
1906 /* Do not slow down if app IO is already waiting for this extent */
1907 }
1908 spin_unlock_irq(&mdev->al_lock);
1909
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001910 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
1911 (int)part_stat_read(&disk->part0, sectors[1]) -
1912 atomic_read(&mdev->rs_sect_ev);
Philipp Reisnere3555d82010-11-07 15:56:29 +01001913
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001914 if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
1915 unsigned long rs_left;
1916 int i;
1917
1918 mdev->rs_last_events = curr_events;
1919
1920 /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
1921 * approx. */
Lars Ellenberg2649f082010-11-05 10:05:47 +01001922 i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
1923
1924 if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
1925 rs_left = mdev->ov_left;
1926 else
1927 rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02001928
1929 dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
1930 if (!dt)
1931 dt++;
1932 db = mdev->rs_mark_left[i] - rs_left;
1933 dbdt = Bit2KB(db/dt);
1934
1935 if (dbdt > mdev->sync_conf.c_min_rate)
1936 throttle = 1;
1937 }
1938 return throttle;
1939}
1940
1941
Philipp Reisner02918be2010-08-20 14:35:10 +02001942static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07001943{
1944 sector_t sector;
1945 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
1946 struct drbd_epoch_entry *e;
1947 struct digest_info *di = NULL;
Philipp Reisnerb18b37b2010-10-13 15:32:44 +02001948 int size, verb;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001949 unsigned int fault_type;
Philipp Reisner02918be2010-08-20 14:35:10 +02001950 struct p_block_req *p = &mdev->data.rbuf.block_req;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001951
1952 sector = be64_to_cpu(p->sector);
1953 size = be32_to_cpu(p->blksize);
1954
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01001955 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001956 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1957 (unsigned long long)sector, size);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001958 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001959 }
1960 if (sector + (size>>9) > capacity) {
1961 dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
1962 (unsigned long long)sector, size);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01001963 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07001964 }
1965
1966 if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
Philipp Reisnerb18b37b2010-10-13 15:32:44 +02001967 verb = 1;
1968 switch (cmd) {
1969 case P_DATA_REQUEST:
1970 drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
1971 break;
1972 case P_RS_DATA_REQUEST:
1973 case P_CSUM_RS_REQUEST:
1974 case P_OV_REQUEST:
1975 drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
1976 break;
1977 case P_OV_REPLY:
1978 verb = 0;
1979 dec_rs_pending(mdev);
1980 drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
1981 break;
1982 default:
1983 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
1984 cmdname(cmd));
1985 }
1986 if (verb && __ratelimit(&drbd_ratelimit_state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07001987 dev_err(DEV, "Can not satisfy peer's read request, "
1988 "no local data.\n");
Philipp Reisnerb18b37b2010-10-13 15:32:44 +02001989
Lars Ellenberga821cc42010-09-06 12:31:37 +02001990 /* drain possibly payload */
1991 return drbd_drain_block(mdev, digest_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001992 }
1993
1994 /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1995 * "criss-cross" setup, that might cause write-out on some other DRBD,
1996 * which in turn might block on the other node at this very place. */
1997 e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
1998 if (!e) {
1999 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002000 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002001 }
2002
Philipp Reisner02918be2010-08-20 14:35:10 +02002003 switch (cmd) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002004 case P_DATA_REQUEST:
2005 e->w.cb = w_e_end_data_req;
2006 fault_type = DRBD_FAULT_DT_RD;
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002007 /* application IO, don't drbd_rs_begin_io */
2008 goto submit;
2009
Philipp Reisnerb411b362009-09-25 16:07:19 -07002010 case P_RS_DATA_REQUEST:
2011 e->w.cb = w_e_end_rsdata_req;
2012 fault_type = DRBD_FAULT_RS_RD;
Lars Ellenberg5f9915b2010-11-09 14:15:24 +01002013 /* used in the sector offset progress display */
2014 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002015 break;
2016
2017 case P_OV_REPLY:
2018 case P_CSUM_RS_REQUEST:
2019 fault_type = DRBD_FAULT_RS_RD;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002020 di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
2021 if (!di)
2022 goto out_free_e;
2023
2024 di->digest_size = digest_size;
2025 di->digest = (((char *)di)+sizeof(struct digest_info));
2026
Lars Ellenbergc36c3ce2010-08-11 20:42:55 +02002027 e->digest = di;
2028 e->flags |= EE_HAS_DIGEST;
2029
Philipp Reisnerb411b362009-09-25 16:07:19 -07002030 if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
2031 goto out_free_e;
2032
Philipp Reisner02918be2010-08-20 14:35:10 +02002033 if (cmd == P_CSUM_RS_REQUEST) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002034 D_ASSERT(mdev->agreed_pro_version >= 89);
2035 e->w.cb = w_e_end_csum_rs_req;
Lars Ellenberg5f9915b2010-11-09 14:15:24 +01002036 /* used in the sector offset progress display */
2037 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
Philipp Reisner02918be2010-08-20 14:35:10 +02002038 } else if (cmd == P_OV_REPLY) {
Lars Ellenberg2649f082010-11-05 10:05:47 +01002039 /* track progress, we may need to throttle */
2040 atomic_add(size >> 9, &mdev->rs_sect_in);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002041 e->w.cb = w_e_end_ov_reply;
2042 dec_rs_pending(mdev);
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002043 /* drbd_rs_begin_io done when we sent this request,
2044 * but accounting still needs to be done. */
2045 goto submit_for_resync;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002046 }
2047 break;
2048
2049 case P_OV_REQUEST:
Philipp Reisnerb411b362009-09-25 16:07:19 -07002050 if (mdev->ov_start_sector == ~(sector_t)0 &&
2051 mdev->agreed_pro_version >= 90) {
Lars Ellenbergde228bb2010-11-05 09:43:15 +01002052 unsigned long now = jiffies;
2053 int i;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002054 mdev->ov_start_sector = sector;
2055 mdev->ov_position = sector;
Lars Ellenberg30b743a2010-11-05 09:39:06 +01002056 mdev->ov_left = drbd_bm_bits(mdev) - BM_SECT_TO_BIT(sector);
2057 mdev->rs_total = mdev->ov_left;
Lars Ellenbergde228bb2010-11-05 09:43:15 +01002058 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2059 mdev->rs_mark_left[i] = mdev->ov_left;
2060 mdev->rs_mark_time[i] = now;
2061 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002062 dev_info(DEV, "Online Verify start sector: %llu\n",
2063 (unsigned long long)sector);
2064 }
2065 e->w.cb = w_e_end_ov_req;
2066 fault_type = DRBD_FAULT_RS_RD;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002067 break;
2068
Philipp Reisnerb411b362009-09-25 16:07:19 -07002069 default:
2070 dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02002071 cmdname(cmd));
Philipp Reisnerb411b362009-09-25 16:07:19 -07002072 fault_type = DRBD_FAULT_MAX;
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002073 goto out_free_e;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002074 }
2075
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002076 /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2077 * wrt the receiver, but it is not as straightforward as it may seem.
2078 * Various places in the resync start and stop logic assume resync
2079 * requests are processed in order, requeuing this on the worker thread
2080 * introduces a bunch of new code for synchronization between threads.
2081 *
2082 * Unlimited throttling before drbd_rs_begin_io may stall the resync
2083 * "forever", throttling after drbd_rs_begin_io will lock that extent
2084 * for application writes for the same time. For now, just throttle
2085 * here, where the rest of the code expects the receiver to sleep for
2086 * a while, anyways.
2087 */
Philipp Reisnerb411b362009-09-25 16:07:19 -07002088
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002089 /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2090 * this defers syncer requests for some time, before letting at least
2091 * on request through. The resync controller on the receiving side
2092 * will adapt to the incoming rate accordingly.
2093 *
2094 * We cannot throttle here if remote is Primary/SyncTarget:
2095 * we would also throttle its application reads.
2096 * In that case, throttling is done on the SyncTarget only.
2097 */
Philipp Reisnere3555d82010-11-07 15:56:29 +01002098 if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
2099 schedule_timeout_uninterruptible(HZ/10);
2100 if (drbd_rs_begin_io(mdev, sector))
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002101 goto out_free_e;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002102
Lars Ellenberg0f0601f2010-08-11 23:40:24 +02002103submit_for_resync:
2104 atomic_add(size >> 9, &mdev->rs_sect_ev);
2105
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002106submit:
Philipp Reisnerb411b362009-09-25 16:07:19 -07002107 inc_unacked(mdev);
Lars Ellenberg80a40e42010-08-11 23:28:00 +02002108 spin_lock_irq(&mdev->req_lock);
2109 list_add_tail(&e->w.list, &mdev->read_ee);
2110 spin_unlock_irq(&mdev->req_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002111
Lars Ellenberg45bb9122010-05-14 17:10:48 +02002112 if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002113 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002114
Lars Ellenberg10f6d9922011-01-24 14:47:09 +01002115 /* don't care for the reason here */
2116 dev_err(DEV, "submit failed, triggering re-connect\n");
Lars Ellenberg22cc37a2010-09-14 20:40:41 +02002117 spin_lock_irq(&mdev->req_lock);
2118 list_del(&e->w.list);
2119 spin_unlock_irq(&mdev->req_lock);
2120 /* no drbd_rs_complete_io(), we are dropping the connection anyways */
2121
Philipp Reisnerb411b362009-09-25 16:07:19 -07002122out_free_e:
Philipp Reisnerb411b362009-09-25 16:07:19 -07002123 put_ldev(mdev);
2124 drbd_free_ee(mdev, e);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002125 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002126}
2127
2128static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
2129{
2130 int self, peer, rv = -100;
2131 unsigned long ch_self, ch_peer;
2132
2133 self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
2134 peer = mdev->p_uuid[UI_BITMAP] & 1;
2135
2136 ch_peer = mdev->p_uuid[UI_SIZE];
2137 ch_self = mdev->comm_bm_set;
2138
2139 switch (mdev->net_conf->after_sb_0p) {
2140 case ASB_CONSENSUS:
2141 case ASB_DISCARD_SECONDARY:
2142 case ASB_CALL_HELPER:
2143 dev_err(DEV, "Configuration error.\n");
2144 break;
2145 case ASB_DISCONNECT:
2146 break;
2147 case ASB_DISCARD_YOUNGER_PRI:
2148 if (self == 0 && peer == 1) {
2149 rv = -1;
2150 break;
2151 }
2152 if (self == 1 && peer == 0) {
2153 rv = 1;
2154 break;
2155 }
2156 /* Else fall through to one of the other strategies... */
2157 case ASB_DISCARD_OLDER_PRI:
2158 if (self == 0 && peer == 1) {
2159 rv = 1;
2160 break;
2161 }
2162 if (self == 1 && peer == 0) {
2163 rv = -1;
2164 break;
2165 }
2166 /* Else fall through to one of the other strategies... */
Lars Ellenbergad19bf62009-10-14 09:36:49 +02002167 dev_warn(DEV, "Discard younger/older primary did not find a decision\n"
Philipp Reisnerb411b362009-09-25 16:07:19 -07002168 "Using discard-least-changes instead\n");
2169 case ASB_DISCARD_ZERO_CHG:
2170 if (ch_peer == 0 && ch_self == 0) {
2171 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2172 ? -1 : 1;
2173 break;
2174 } else {
2175 if (ch_peer == 0) { rv = 1; break; }
2176 if (ch_self == 0) { rv = -1; break; }
2177 }
2178 if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
2179 break;
2180 case ASB_DISCARD_LEAST_CHG:
2181 if (ch_self < ch_peer)
2182 rv = -1;
2183 else if (ch_self > ch_peer)
2184 rv = 1;
2185 else /* ( ch_self == ch_peer ) */
2186 /* Well, then use something else. */
2187 rv = test_bit(DISCARD_CONCURRENT, &mdev->flags)
2188 ? -1 : 1;
2189 break;
2190 case ASB_DISCARD_LOCAL:
2191 rv = -1;
2192 break;
2193 case ASB_DISCARD_REMOTE:
2194 rv = 1;
2195 }
2196
2197 return rv;
2198}
2199
2200static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
2201{
Andreas Gruenbacher6184ea22010-12-09 14:23:27 +01002202 int hg, rv = -100;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002203
2204 switch (mdev->net_conf->after_sb_1p) {
2205 case ASB_DISCARD_YOUNGER_PRI:
2206 case ASB_DISCARD_OLDER_PRI:
2207 case ASB_DISCARD_LEAST_CHG:
2208 case ASB_DISCARD_LOCAL:
2209 case ASB_DISCARD_REMOTE:
2210 dev_err(DEV, "Configuration error.\n");
2211 break;
2212 case ASB_DISCONNECT:
2213 break;
2214 case ASB_CONSENSUS:
2215 hg = drbd_asb_recover_0p(mdev);
2216 if (hg == -1 && mdev->state.role == R_SECONDARY)
2217 rv = hg;
2218 if (hg == 1 && mdev->state.role == R_PRIMARY)
2219 rv = hg;
2220 break;
2221 case ASB_VIOLENTLY:
2222 rv = drbd_asb_recover_0p(mdev);
2223 break;
2224 case ASB_DISCARD_SECONDARY:
2225 return mdev->state.role == R_PRIMARY ? 1 : -1;
2226 case ASB_CALL_HELPER:
2227 hg = drbd_asb_recover_0p(mdev);
2228 if (hg == -1 && mdev->state.role == R_PRIMARY) {
Andreas Gruenbacherbb437942010-12-09 14:02:35 +01002229 enum drbd_state_rv rv2;
2230
2231 drbd_set_role(mdev, R_SECONDARY, 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002232 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2233 * we might be here in C_WF_REPORT_PARAMS which is transient.
2234 * we do not need to wait for the after state change work either. */
Andreas Gruenbacherbb437942010-12-09 14:02:35 +01002235 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2236 if (rv2 != SS_SUCCESS) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002237 drbd_khelper(mdev, "pri-lost-after-sb");
2238 } else {
2239 dev_warn(DEV, "Successfully gave up primary role.\n");
2240 rv = hg;
2241 }
2242 } else
2243 rv = hg;
2244 }
2245
2246 return rv;
2247}
2248
2249static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
2250{
Andreas Gruenbacher6184ea22010-12-09 14:23:27 +01002251 int hg, rv = -100;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002252
2253 switch (mdev->net_conf->after_sb_2p) {
2254 case ASB_DISCARD_YOUNGER_PRI:
2255 case ASB_DISCARD_OLDER_PRI:
2256 case ASB_DISCARD_LEAST_CHG:
2257 case ASB_DISCARD_LOCAL:
2258 case ASB_DISCARD_REMOTE:
2259 case ASB_CONSENSUS:
2260 case ASB_DISCARD_SECONDARY:
2261 dev_err(DEV, "Configuration error.\n");
2262 break;
2263 case ASB_VIOLENTLY:
2264 rv = drbd_asb_recover_0p(mdev);
2265 break;
2266 case ASB_DISCONNECT:
2267 break;
2268 case ASB_CALL_HELPER:
2269 hg = drbd_asb_recover_0p(mdev);
2270 if (hg == -1) {
Andreas Gruenbacherbb437942010-12-09 14:02:35 +01002271 enum drbd_state_rv rv2;
2272
Philipp Reisnerb411b362009-09-25 16:07:19 -07002273 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
2274 * we might be here in C_WF_REPORT_PARAMS which is transient.
2275 * we do not need to wait for the after state change work either. */
Andreas Gruenbacherbb437942010-12-09 14:02:35 +01002276 rv2 = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY));
2277 if (rv2 != SS_SUCCESS) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002278 drbd_khelper(mdev, "pri-lost-after-sb");
2279 } else {
2280 dev_warn(DEV, "Successfully gave up primary role.\n");
2281 rv = hg;
2282 }
2283 } else
2284 rv = hg;
2285 }
2286
2287 return rv;
2288}
2289
2290static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid,
2291 u64 bits, u64 flags)
2292{
2293 if (!uuid) {
2294 dev_info(DEV, "%s uuid info vanished while I was looking!\n", text);
2295 return;
2296 }
2297 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
2298 text,
2299 (unsigned long long)uuid[UI_CURRENT],
2300 (unsigned long long)uuid[UI_BITMAP],
2301 (unsigned long long)uuid[UI_HISTORY_START],
2302 (unsigned long long)uuid[UI_HISTORY_END],
2303 (unsigned long long)bits,
2304 (unsigned long long)flags);
2305}
2306
2307/*
2308 100 after split brain try auto recover
2309 2 C_SYNC_SOURCE set BitMap
2310 1 C_SYNC_SOURCE use BitMap
2311 0 no Sync
2312 -1 C_SYNC_TARGET use BitMap
2313 -2 C_SYNC_TARGET set BitMap
2314 -100 after split brain, disconnect
2315-1000 unrelated data
Philipp Reisner4a23f262011-01-11 17:42:17 +01002316-1091 requires proto 91
2317-1096 requires proto 96
Philipp Reisnerb411b362009-09-25 16:07:19 -07002318 */
2319static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local)
2320{
2321 u64 self, peer;
2322 int i, j;
2323
2324 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2325 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2326
2327 *rule_nr = 10;
2328 if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
2329 return 0;
2330
2331 *rule_nr = 20;
2332 if ((self == UUID_JUST_CREATED || self == (u64)0) &&
2333 peer != UUID_JUST_CREATED)
2334 return -2;
2335
2336 *rule_nr = 30;
2337 if (self != UUID_JUST_CREATED &&
2338 (peer == UUID_JUST_CREATED || peer == (u64)0))
2339 return 2;
2340
2341 if (self == peer) {
2342 int rct, dc; /* roles at crash time */
2343
2344 if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
2345
2346 if (mdev->agreed_pro_version < 91)
Philipp Reisner4a23f262011-01-11 17:42:17 +01002347 return -1091;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002348
2349 if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
2350 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
2351 dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n");
2352 drbd_uuid_set_bm(mdev, 0UL);
2353
2354 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2355 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2356 *rule_nr = 34;
2357 } else {
2358 dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n");
2359 *rule_nr = 36;
2360 }
2361
2362 return 1;
2363 }
2364
2365 if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
2366
2367 if (mdev->agreed_pro_version < 91)
Philipp Reisner4a23f262011-01-11 17:42:17 +01002368 return -1091;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002369
2370 if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
2371 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
2372 dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
2373
2374 mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START];
2375 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP];
2376 mdev->p_uuid[UI_BITMAP] = 0UL;
2377
2378 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2379 *rule_nr = 35;
2380 } else {
2381 dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n");
2382 *rule_nr = 37;
2383 }
2384
2385 return -1;
2386 }
2387
2388 /* Common power [off|failure] */
2389 rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
2390 (mdev->p_uuid[UI_FLAGS] & 2);
2391 /* lowest bit is set when we were primary,
2392 * next bit (weight 2) is set when peer was primary */
2393 *rule_nr = 40;
2394
2395 switch (rct) {
2396 case 0: /* !self_pri && !peer_pri */ return 0;
2397 case 1: /* self_pri && !peer_pri */ return 1;
2398 case 2: /* !self_pri && peer_pri */ return -1;
2399 case 3: /* self_pri && peer_pri */
2400 dc = test_bit(DISCARD_CONCURRENT, &mdev->flags);
2401 return dc ? -1 : 1;
2402 }
2403 }
2404
2405 *rule_nr = 50;
2406 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2407 if (self == peer)
2408 return -1;
2409
2410 *rule_nr = 51;
2411 peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
2412 if (self == peer) {
Philipp Reisner4a23f262011-01-11 17:42:17 +01002413 if (mdev->agreed_pro_version < 96 ?
2414 (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
2415 (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
2416 peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002417 /* The last P_SYNC_UUID did not get though. Undo the last start of
2418 resync as sync source modifications of the peer's UUIDs. */
2419
2420 if (mdev->agreed_pro_version < 91)
Philipp Reisner4a23f262011-01-11 17:42:17 +01002421 return -1091;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002422
2423 mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
2424 mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
Philipp Reisner4a23f262011-01-11 17:42:17 +01002425
2426 dev_info(DEV, "Did not got last syncUUID packet, corrected:\n");
2427 drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2428
Philipp Reisnerb411b362009-09-25 16:07:19 -07002429 return -1;
2430 }
2431 }
2432
2433 *rule_nr = 60;
2434 self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
2435 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2436 peer = mdev->p_uuid[i] & ~((u64)1);
2437 if (self == peer)
2438 return -2;
2439 }
2440
2441 *rule_nr = 70;
2442 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2443 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
2444 if (self == peer)
2445 return 1;
2446
2447 *rule_nr = 71;
2448 self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
2449 if (self == peer) {
Philipp Reisner4a23f262011-01-11 17:42:17 +01002450 if (mdev->agreed_pro_version < 96 ?
2451 (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
2452 (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
2453 self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002454 /* The last P_SYNC_UUID did not get though. Undo the last start of
2455 resync as sync source modifications of our UUIDs. */
2456
2457 if (mdev->agreed_pro_version < 91)
Philipp Reisner4a23f262011-01-11 17:42:17 +01002458 return -1091;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002459
2460 _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
2461 _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]);
2462
Philipp Reisner4a23f262011-01-11 17:42:17 +01002463 dev_info(DEV, "Last syncUUID did not get through, corrected:\n");
Philipp Reisnerb411b362009-09-25 16:07:19 -07002464 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid,
2465 mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0);
2466
2467 return 1;
2468 }
2469 }
2470
2471
2472 *rule_nr = 80;
Philipp Reisnerd8c2a362009-11-18 15:52:51 +01002473 peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002474 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2475 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2476 if (self == peer)
2477 return 2;
2478 }
2479
2480 *rule_nr = 90;
2481 self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
2482 peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1);
2483 if (self == peer && self != ((u64)0))
2484 return 100;
2485
2486 *rule_nr = 100;
2487 for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
2488 self = mdev->ldev->md.uuid[i] & ~((u64)1);
2489 for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
2490 peer = mdev->p_uuid[j] & ~((u64)1);
2491 if (self == peer)
2492 return -100;
2493 }
2494 }
2495
2496 return -1000;
2497}
2498
2499/* drbd_sync_handshake() returns the new conn state on success, or
2500 CONN_MASK (-1) on failure.
2501 */
2502static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
2503 enum drbd_disk_state peer_disk) __must_hold(local)
2504{
2505 int hg, rule_nr;
2506 enum drbd_conns rv = C_MASK;
2507 enum drbd_disk_state mydisk;
2508
2509 mydisk = mdev->state.disk;
2510 if (mydisk == D_NEGOTIATING)
2511 mydisk = mdev->new_state_tmp.disk;
2512
2513 dev_info(DEV, "drbd_sync_handshake:\n");
2514 drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0);
2515 drbd_uuid_dump(mdev, "peer", mdev->p_uuid,
2516 mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
2517
2518 hg = drbd_uuid_compare(mdev, &rule_nr);
2519
2520 dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
2521
2522 if (hg == -1000) {
2523 dev_alert(DEV, "Unrelated data, aborting!\n");
2524 return C_MASK;
2525 }
Philipp Reisner4a23f262011-01-11 17:42:17 +01002526 if (hg < -1000) {
2527 dev_alert(DEV, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002528 return C_MASK;
2529 }
2530
2531 if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
2532 (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
2533 int f = (hg == -100) || abs(hg) == 2;
2534 hg = mydisk > D_INCONSISTENT ? 1 : -1;
2535 if (f)
2536 hg = hg*2;
2537 dev_info(DEV, "Becoming sync %s due to disk states.\n",
2538 hg > 0 ? "source" : "target");
2539 }
2540
Adam Gandelman3a11a482010-04-08 16:48:23 -07002541 if (abs(hg) == 100)
2542 drbd_khelper(mdev, "initial-split-brain");
2543
Philipp Reisnerb411b362009-09-25 16:07:19 -07002544 if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
2545 int pcount = (mdev->state.role == R_PRIMARY)
2546 + (peer_role == R_PRIMARY);
2547 int forced = (hg == -100);
2548
2549 switch (pcount) {
2550 case 0:
2551 hg = drbd_asb_recover_0p(mdev);
2552 break;
2553 case 1:
2554 hg = drbd_asb_recover_1p(mdev);
2555 break;
2556 case 2:
2557 hg = drbd_asb_recover_2p(mdev);
2558 break;
2559 }
2560 if (abs(hg) < 100) {
2561 dev_warn(DEV, "Split-Brain detected, %d primaries, "
2562 "automatically solved. Sync from %s node\n",
2563 pcount, (hg < 0) ? "peer" : "this");
2564 if (forced) {
2565 dev_warn(DEV, "Doing a full sync, since"
2566 " UUIDs where ambiguous.\n");
2567 hg = hg*2;
2568 }
2569 }
2570 }
2571
2572 if (hg == -100) {
2573 if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
2574 hg = -1;
2575 if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
2576 hg = 1;
2577
2578 if (abs(hg) < 100)
2579 dev_warn(DEV, "Split-Brain detected, manually solved. "
2580 "Sync from %s node\n",
2581 (hg < 0) ? "peer" : "this");
2582 }
2583
2584 if (hg == -100) {
Lars Ellenberg580b9762010-02-26 23:15:23 +01002585 /* FIXME this log message is not correct if we end up here
2586 * after an attempted attach on a diskless node.
2587 * We just refuse to attach -- well, we drop the "connection"
2588 * to that disk, in a way... */
Adam Gandelman3a11a482010-04-08 16:48:23 -07002589 dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
Philipp Reisnerb411b362009-09-25 16:07:19 -07002590 drbd_khelper(mdev, "split-brain");
2591 return C_MASK;
2592 }
2593
2594 if (hg > 0 && mydisk <= D_INCONSISTENT) {
2595 dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n");
2596 return C_MASK;
2597 }
2598
2599 if (hg < 0 && /* by intention we do not use mydisk here. */
2600 mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
2601 switch (mdev->net_conf->rr_conflict) {
2602 case ASB_CALL_HELPER:
2603 drbd_khelper(mdev, "pri-lost");
2604 /* fall through */
2605 case ASB_DISCONNECT:
2606 dev_err(DEV, "I shall become SyncTarget, but I am primary!\n");
2607 return C_MASK;
2608 case ASB_VIOLENTLY:
2609 dev_warn(DEV, "Becoming SyncTarget, violating the stable-data"
2610 "assumption\n");
2611 }
2612 }
2613
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002614 if (mdev->net_conf->dry_run || test_bit(CONN_DRY_RUN, &mdev->flags)) {
2615 if (hg == 0)
2616 dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
2617 else
2618 dev_info(DEV, "dry-run connect: Would become %s, doing a %s resync.",
2619 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
2620 abs(hg) >= 2 ? "full" : "bit-map based");
2621 return C_MASK;
2622 }
2623
Philipp Reisnerb411b362009-09-25 16:07:19 -07002624 if (abs(hg) >= 2) {
2625 dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01002626 if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
2627 BM_LOCKED_SET_ALLOWED))
Philipp Reisnerb411b362009-09-25 16:07:19 -07002628 return C_MASK;
2629 }
2630
2631 if (hg > 0) { /* become sync source. */
2632 rv = C_WF_BITMAP_S;
2633 } else if (hg < 0) { /* become sync target */
2634 rv = C_WF_BITMAP_T;
2635 } else {
2636 rv = C_CONNECTED;
2637 if (drbd_bm_total_weight(mdev)) {
2638 dev_info(DEV, "No resync, but %lu bits in bitmap!\n",
2639 drbd_bm_total_weight(mdev));
2640 }
2641 }
2642
2643 return rv;
2644}
2645
2646/* returns 1 if invalid */
2647static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
2648{
2649 /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
2650 if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
2651 (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
2652 return 0;
2653
2654 /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
2655 if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
2656 self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
2657 return 1;
2658
2659 /* everything else is valid if they are equal on both sides. */
2660 if (peer == self)
2661 return 0;
2662
2663 /* everything es is invalid. */
2664 return 1;
2665}
2666
Philipp Reisner02918be2010-08-20 14:35:10 +02002667static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002668{
Philipp Reisner02918be2010-08-20 14:35:10 +02002669 struct p_protocol *p = &mdev->data.rbuf.protocol;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002670 int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002671 int p_want_lose, p_two_primaries, cf;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002672 char p_integrity_alg[SHARED_SECRET_MAX] = "";
2673
Philipp Reisnerb411b362009-09-25 16:07:19 -07002674 p_proto = be32_to_cpu(p->protocol);
2675 p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
2676 p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
2677 p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002678 p_two_primaries = be32_to_cpu(p->two_primaries);
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01002679 cf = be32_to_cpu(p->conn_flags);
2680 p_want_lose = cf & CF_WANT_LOSE;
2681
2682 clear_bit(CONN_DRY_RUN, &mdev->flags);
2683
2684 if (cf & CF_DRY_RUN)
2685 set_bit(CONN_DRY_RUN, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002686
2687 if (p_proto != mdev->net_conf->wire_protocol) {
2688 dev_err(DEV, "incompatible communication protocols\n");
2689 goto disconnect;
2690 }
2691
2692 if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
2693 dev_err(DEV, "incompatible after-sb-0pri settings\n");
2694 goto disconnect;
2695 }
2696
2697 if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
2698 dev_err(DEV, "incompatible after-sb-1pri settings\n");
2699 goto disconnect;
2700 }
2701
2702 if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
2703 dev_err(DEV, "incompatible after-sb-2pri settings\n");
2704 goto disconnect;
2705 }
2706
2707 if (p_want_lose && mdev->net_conf->want_lose) {
2708 dev_err(DEV, "both sides have the 'want_lose' flag set\n");
2709 goto disconnect;
2710 }
2711
2712 if (p_two_primaries != mdev->net_conf->two_primaries) {
2713 dev_err(DEV, "incompatible setting of the two-primaries options\n");
2714 goto disconnect;
2715 }
2716
2717 if (mdev->agreed_pro_version >= 87) {
2718 unsigned char *my_alg = mdev->net_conf->integrity_alg;
2719
2720 if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002721 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002722
2723 p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
2724 if (strcmp(p_integrity_alg, my_alg)) {
2725 dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
2726 goto disconnect;
2727 }
2728 dev_info(DEV, "data-integrity-alg: %s\n",
2729 my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
2730 }
2731
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002732 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002733
2734disconnect:
2735 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002736 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002737}
2738
2739/* helper function
2740 * input: alg name, feature name
2741 * return: NULL (alg name was "")
2742 * ERR_PTR(error) if something goes wrong
2743 * or the crypto hash ptr, if it worked out ok. */
2744struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
2745 const char *alg, const char *name)
2746{
2747 struct crypto_hash *tfm;
2748
2749 if (!alg[0])
2750 return NULL;
2751
2752 tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
2753 if (IS_ERR(tfm)) {
2754 dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n",
2755 alg, name, PTR_ERR(tfm));
2756 return tfm;
2757 }
2758 if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
2759 crypto_free_hash(tfm);
2760 dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
2761 return ERR_PTR(-EINVAL);
2762 }
2763 return tfm;
2764}
2765
Philipp Reisner02918be2010-08-20 14:35:10 +02002766static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002767{
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002768 int ok = true;
Philipp Reisner02918be2010-08-20 14:35:10 +02002769 struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002770 unsigned int header_size, data_size, exp_max_sz;
2771 struct crypto_hash *verify_tfm = NULL;
2772 struct crypto_hash *csums_tfm = NULL;
2773 const int apv = mdev->agreed_pro_version;
Philipp Reisner778f2712010-07-06 11:14:00 +02002774 int *rs_plan_s = NULL;
2775 int fifo_size = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002776
2777 exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
2778 : apv == 88 ? sizeof(struct p_rs_param)
2779 + SHARED_SECRET_MAX
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002780 : apv <= 94 ? sizeof(struct p_rs_param_89)
2781 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002782
Philipp Reisner02918be2010-08-20 14:35:10 +02002783 if (packet_size > exp_max_sz) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07002784 dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02002785 packet_size, exp_max_sz);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002786 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002787 }
2788
2789 if (apv <= 88) {
Philipp Reisner02918be2010-08-20 14:35:10 +02002790 header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
2791 data_size = packet_size - header_size;
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002792 } else if (apv <= 94) {
Philipp Reisner02918be2010-08-20 14:35:10 +02002793 header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
2794 data_size = packet_size - header_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002795 D_ASSERT(data_size == 0);
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002796 } else {
Philipp Reisner02918be2010-08-20 14:35:10 +02002797 header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
2798 data_size = packet_size - header_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002799 D_ASSERT(data_size == 0);
2800 }
2801
2802 /* initialize verify_alg and csums_alg */
2803 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2804
Philipp Reisner02918be2010-08-20 14:35:10 +02002805 if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002806 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002807
2808 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2809
2810 if (apv >= 88) {
2811 if (apv == 88) {
2812 if (data_size > SHARED_SECRET_MAX) {
2813 dev_err(DEV, "verify-alg too long, "
2814 "peer wants %u, accepting only %u byte\n",
2815 data_size, SHARED_SECRET_MAX);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002816 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002817 }
2818
2819 if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002820 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002821
2822 /* we expect NUL terminated string */
2823 /* but just in case someone tries to be evil */
2824 D_ASSERT(p->verify_alg[data_size-1] == 0);
2825 p->verify_alg[data_size-1] = 0;
2826
2827 } else /* apv >= 89 */ {
2828 /* we still expect NUL terminated strings */
2829 /* but just in case someone tries to be evil */
2830 D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0);
2831 D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0);
2832 p->verify_alg[SHARED_SECRET_MAX-1] = 0;
2833 p->csums_alg[SHARED_SECRET_MAX-1] = 0;
2834 }
2835
2836 if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
2837 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2838 dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
2839 mdev->sync_conf.verify_alg, p->verify_alg);
2840 goto disconnect;
2841 }
2842 verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
2843 p->verify_alg, "verify-alg");
2844 if (IS_ERR(verify_tfm)) {
2845 verify_tfm = NULL;
2846 goto disconnect;
2847 }
2848 }
2849
2850 if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
2851 if (mdev->state.conn == C_WF_REPORT_PARAMS) {
2852 dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
2853 mdev->sync_conf.csums_alg, p->csums_alg);
2854 goto disconnect;
2855 }
2856 csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
2857 p->csums_alg, "csums-alg");
2858 if (IS_ERR(csums_tfm)) {
2859 csums_tfm = NULL;
2860 goto disconnect;
2861 }
2862 }
2863
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002864 if (apv > 94) {
2865 mdev->sync_conf.rate = be32_to_cpu(p->rate);
2866 mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
2867 mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
2868 mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
2869 mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
Philipp Reisner778f2712010-07-06 11:14:00 +02002870
2871 fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
2872 if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
2873 rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
2874 if (!rs_plan_s) {
2875 dev_err(DEV, "kmalloc of fifo_buffer failed");
2876 goto disconnect;
2877 }
2878 }
Philipp Reisner8e26f9c2010-07-06 17:25:54 +02002879 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002880
2881 spin_lock(&mdev->peer_seq_lock);
2882 /* lock against drbd_nl_syncer_conf() */
2883 if (verify_tfm) {
2884 strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
2885 mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
2886 crypto_free_hash(mdev->verify_tfm);
2887 mdev->verify_tfm = verify_tfm;
2888 dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
2889 }
2890 if (csums_tfm) {
2891 strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
2892 mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
2893 crypto_free_hash(mdev->csums_tfm);
2894 mdev->csums_tfm = csums_tfm;
2895 dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
2896 }
Philipp Reisner778f2712010-07-06 11:14:00 +02002897 if (fifo_size != mdev->rs_plan_s.size) {
2898 kfree(mdev->rs_plan_s.values);
2899 mdev->rs_plan_s.values = rs_plan_s;
2900 mdev->rs_plan_s.size = fifo_size;
2901 mdev->rs_planed = 0;
2902 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002903 spin_unlock(&mdev->peer_seq_lock);
2904 }
2905
2906 return ok;
2907disconnect:
2908 /* just for completeness: actually not needed,
2909 * as this is not reached if csums_tfm was ok. */
2910 crypto_free_hash(csums_tfm);
2911 /* but free the verify_tfm again, if csums_tfm did not work out */
2912 crypto_free_hash(verify_tfm);
2913 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002914 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002915}
2916
2917static void drbd_setup_order_type(struct drbd_conf *mdev, int peer)
2918{
2919 /* sorry, we currently have no working implementation
2920 * of distributed TCQ */
2921}
2922
2923/* warn if the arguments differ by more than 12.5% */
2924static void warn_if_differ_considerably(struct drbd_conf *mdev,
2925 const char *s, sector_t a, sector_t b)
2926{
2927 sector_t d;
2928 if (a == 0 || b == 0)
2929 return;
2930 d = (a > b) ? (a - b) : (b - a);
2931 if (d > (a>>3) || d > (b>>3))
2932 dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s,
2933 (unsigned long long)a, (unsigned long long)b);
2934}
2935
Philipp Reisner02918be2010-08-20 14:35:10 +02002936static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07002937{
Philipp Reisner02918be2010-08-20 14:35:10 +02002938 struct p_sizes *p = &mdev->data.rbuf.sizes;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002939 enum determine_dev_size dd = unchanged;
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01002940 unsigned int max_bio_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002941 sector_t p_size, p_usize, my_usize;
2942 int ldsc = 0; /* local disk size changed */
Philipp Reisnere89b5912010-03-24 17:11:33 +01002943 enum dds_flags ddsf;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002944
Philipp Reisnerb411b362009-09-25 16:07:19 -07002945 p_size = be64_to_cpu(p->d_size);
2946 p_usize = be64_to_cpu(p->u_size);
2947
2948 if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
2949 dev_err(DEV, "some backing storage is needed\n");
2950 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002951 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002952 }
2953
2954 /* just store the peer's disk size for now.
2955 * we still need to figure out whether we accept that. */
2956 mdev->p_size = p_size;
2957
Philipp Reisnerb411b362009-09-25 16:07:19 -07002958 if (get_ldev(mdev)) {
2959 warn_if_differ_considerably(mdev, "lower level device sizes",
2960 p_size, drbd_get_max_capacity(mdev->ldev));
2961 warn_if_differ_considerably(mdev, "user requested size",
2962 p_usize, mdev->ldev->dc.disk_size);
2963
2964 /* if this is the first connect, or an otherwise expected
2965 * param exchange, choose the minimum */
2966 if (mdev->state.conn == C_WF_REPORT_PARAMS)
2967 p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
2968 p_usize);
2969
2970 my_usize = mdev->ldev->dc.disk_size;
2971
2972 if (mdev->ldev->dc.disk_size != p_usize) {
2973 mdev->ldev->dc.disk_size = p_usize;
2974 dev_info(DEV, "Peer sets u_size to %lu sectors\n",
2975 (unsigned long)mdev->ldev->dc.disk_size);
2976 }
2977
2978 /* Never shrink a device with usable data during connect.
2979 But allow online shrinking if we are connected. */
Philipp Reisnera393db62009-12-22 13:35:52 +01002980 if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
Philipp Reisnerb411b362009-09-25 16:07:19 -07002981 drbd_get_capacity(mdev->this_bdev) &&
2982 mdev->state.disk >= D_OUTDATED &&
2983 mdev->state.conn < C_CONNECTED) {
2984 dev_err(DEV, "The peer's disk size is too small!\n");
2985 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
2986 mdev->ldev->dc.disk_size = my_usize;
2987 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002988 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002989 }
2990 put_ldev(mdev);
2991 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07002992
Philipp Reisnere89b5912010-03-24 17:11:33 +01002993 ddsf = be16_to_cpu(p->dds_flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002994 if (get_ldev(mdev)) {
Philipp Reisnere89b5912010-03-24 17:11:33 +01002995 dd = drbd_determin_dev_size(mdev, ddsf);
Philipp Reisnerb411b362009-09-25 16:07:19 -07002996 put_ldev(mdev);
2997 if (dd == dev_size_error)
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01002998 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07002999 drbd_md_sync(mdev);
3000 } else {
3001 /* I am diskless, need to accept the peer's size. */
3002 drbd_set_my_capacity(mdev, p_size);
3003 }
3004
Philipp Reisnerb411b362009-09-25 16:07:19 -07003005 if (get_ldev(mdev)) {
3006 if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
3007 mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
3008 ldsc = 1;
3009 }
3010
Lars Ellenberga1c88d02010-05-14 19:16:41 +02003011 if (mdev->agreed_pro_version < 94)
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003012 max_bio_size = be32_to_cpu(p->max_bio_size);
Lars Ellenberg8979d9c2010-09-14 15:56:29 +02003013 else if (mdev->agreed_pro_version == 94)
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003014 max_bio_size = DRBD_MAX_SIZE_H80_PACKET;
Lars Ellenberga1c88d02010-05-14 19:16:41 +02003015 else /* drbd 8.3.8 onwards */
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003016 max_bio_size = DRBD_MAX_BIO_SIZE;
Lars Ellenberga1c88d02010-05-14 19:16:41 +02003017
Lars Ellenberg1816a2b2010-11-11 15:19:07 +01003018 if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9)
3019 drbd_setup_queue_param(mdev, max_bio_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003020
Philipp Reisnere89b5912010-03-24 17:11:33 +01003021 drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003022 put_ldev(mdev);
3023 }
3024
3025 if (mdev->state.conn > C_WF_REPORT_PARAMS) {
3026 if (be64_to_cpu(p->c_size) !=
3027 drbd_get_capacity(mdev->this_bdev) || ldsc) {
3028 /* we have different sizes, probably peer
3029 * needs to know my new size... */
Philipp Reisnere89b5912010-03-24 17:11:33 +01003030 drbd_send_sizes(mdev, 0, ddsf);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003031 }
3032 if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
3033 (dd == grew && mdev->state.conn == C_CONNECTED)) {
3034 if (mdev->state.pdsk >= D_INCONSISTENT &&
Philipp Reisnere89b5912010-03-24 17:11:33 +01003035 mdev->state.disk >= D_INCONSISTENT) {
3036 if (ddsf & DDSF_NO_RESYNC)
3037 dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
3038 else
3039 resync_after_online_grow(mdev);
3040 } else
Philipp Reisnerb411b362009-09-25 16:07:19 -07003041 set_bit(RESYNC_AFTER_NEG, &mdev->flags);
3042 }
3043 }
3044
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003045 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003046}
3047
Philipp Reisner02918be2010-08-20 14:35:10 +02003048static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003049{
Philipp Reisner02918be2010-08-20 14:35:10 +02003050 struct p_uuids *p = &mdev->data.rbuf.uuids;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003051 u64 *p_uuid;
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003052 int i, updated_uuids = 0;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003053
Philipp Reisnerb411b362009-09-25 16:07:19 -07003054 p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
3055
3056 for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
3057 p_uuid[i] = be64_to_cpu(p->uuid[i]);
3058
3059 kfree(mdev->p_uuid);
3060 mdev->p_uuid = p_uuid;
3061
3062 if (mdev->state.conn < C_CONNECTED &&
3063 mdev->state.disk < D_INCONSISTENT &&
3064 mdev->state.role == R_PRIMARY &&
3065 (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
3066 dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
3067 (unsigned long long)mdev->ed_uuid);
3068 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003069 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003070 }
3071
3072 if (get_ldev(mdev)) {
3073 int skip_initial_sync =
3074 mdev->state.conn == C_CONNECTED &&
3075 mdev->agreed_pro_version >= 90 &&
3076 mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
3077 (p_uuid[UI_FLAGS] & 8);
3078 if (skip_initial_sync) {
3079 dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n");
3080 drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003081 "clear_n_write from receive_uuids",
3082 BM_LOCKED_TEST_ALLOWED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003083 _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]);
3084 _drbd_uuid_set(mdev, UI_BITMAP, 0);
3085 _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
3086 CS_VERBOSE, NULL);
3087 drbd_md_sync(mdev);
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003088 updated_uuids = 1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003089 }
3090 put_ldev(mdev);
Philipp Reisner18a50fa2010-06-21 14:14:15 +02003091 } else if (mdev->state.disk < D_INCONSISTENT &&
3092 mdev->state.role == R_PRIMARY) {
3093 /* I am a diskless primary, the peer just created a new current UUID
3094 for me. */
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003095 updated_uuids = drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003096 }
3097
3098 /* Before we test for the disk state, we should wait until an eventually
3099 ongoing cluster wide state change is finished. That is important if
3100 we are primary and are detaching from our disk. We need to see the
3101 new disk state... */
3102 wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags));
3103 if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003104 updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
3105
3106 if (updated_uuids)
3107 drbd_print_uuids(mdev, "receiver updated UUIDs to");
Philipp Reisnerb411b362009-09-25 16:07:19 -07003108
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003109 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003110}
3111
3112/**
3113 * convert_state() - Converts the peer's view of the cluster state to our point of view
3114 * @ps: The state as seen by the peer.
3115 */
3116static union drbd_state convert_state(union drbd_state ps)
3117{
3118 union drbd_state ms;
3119
3120 static enum drbd_conns c_tab[] = {
3121 [C_CONNECTED] = C_CONNECTED,
3122
3123 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
3124 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
3125 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
3126 [C_VERIFY_S] = C_VERIFY_T,
3127 [C_MASK] = C_MASK,
3128 };
3129
3130 ms.i = ps.i;
3131
3132 ms.conn = c_tab[ps.conn];
3133 ms.peer = ps.role;
3134 ms.role = ps.peer;
3135 ms.pdsk = ps.disk;
3136 ms.disk = ps.pdsk;
3137 ms.peer_isp = (ps.aftr_isp | ps.user_isp);
3138
3139 return ms;
3140}
3141
Philipp Reisner02918be2010-08-20 14:35:10 +02003142static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003143{
Philipp Reisner02918be2010-08-20 14:35:10 +02003144 struct p_req_state *p = &mdev->data.rbuf.req_state;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003145 union drbd_state mask, val;
Andreas Gruenbacherbf885f82010-12-08 00:39:32 +01003146 enum drbd_state_rv rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003147
Philipp Reisnerb411b362009-09-25 16:07:19 -07003148 mask.i = be32_to_cpu(p->mask);
3149 val.i = be32_to_cpu(p->val);
3150
3151 if (test_bit(DISCARD_CONCURRENT, &mdev->flags) &&
3152 test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) {
3153 drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003154 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003155 }
3156
3157 mask = convert_state(mask);
3158 val = convert_state(val);
3159
3160 rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
3161
3162 drbd_send_sr_reply(mdev, rv);
3163 drbd_md_sync(mdev);
3164
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003165 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003166}
3167
Philipp Reisner02918be2010-08-20 14:35:10 +02003168static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003169{
Philipp Reisner02918be2010-08-20 14:35:10 +02003170 struct p_state *p = &mdev->data.rbuf.state;
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003171 union drbd_state os, ns, peer_state;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003172 enum drbd_disk_state real_peer_disk;
Philipp Reisner65d922c2010-06-16 16:18:09 +02003173 enum chg_state_flags cs_flags;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003174 int rv;
3175
Philipp Reisnerb411b362009-09-25 16:07:19 -07003176 peer_state.i = be32_to_cpu(p->state);
3177
3178 real_peer_disk = peer_state.disk;
3179 if (peer_state.disk == D_NEGOTIATING) {
3180 real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
3181 dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
3182 }
3183
3184 spin_lock_irq(&mdev->req_lock);
3185 retry:
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003186 os = ns = mdev->state;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003187 spin_unlock_irq(&mdev->req_lock);
3188
Lars Ellenberge9ef7bb2010-10-07 15:55:39 +02003189 /* peer says his disk is uptodate, while we think it is inconsistent,
3190 * and this happens while we think we have a sync going on. */
3191 if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
3192 os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
3193 /* If we are (becoming) SyncSource, but peer is still in sync
3194 * preparation, ignore its uptodate-ness to avoid flapping, it
3195 * will change to inconsistent once the peer reaches active
3196 * syncing states.
3197 * It may have changed syncer-paused flags, however, so we
3198 * cannot ignore this completely. */
3199 if (peer_state.conn > C_CONNECTED &&
3200 peer_state.conn < C_SYNC_SOURCE)
3201 real_peer_disk = D_INCONSISTENT;
3202
3203 /* if peer_state changes to connected at the same time,
3204 * it explicitly notifies us that it finished resync.
3205 * Maybe we should finish it up, too? */
3206 else if (os.conn >= C_SYNC_SOURCE &&
3207 peer_state.conn == C_CONNECTED) {
3208 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
3209 drbd_resync_finished(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003210 return true;
Lars Ellenberge9ef7bb2010-10-07 15:55:39 +02003211 }
3212 }
3213
3214 /* peer says his disk is inconsistent, while we think it is uptodate,
3215 * and this happens while the peer still thinks we have a sync going on,
3216 * but we think we are already done with the sync.
3217 * We ignore this to avoid flapping pdsk.
3218 * This should not happen, if the peer is a recent version of drbd. */
3219 if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
3220 os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
3221 real_peer_disk = D_UP_TO_DATE;
3222
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003223 if (ns.conn == C_WF_REPORT_PARAMS)
3224 ns.conn = C_CONNECTED;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003225
Philipp Reisner67531712010-10-27 12:21:30 +02003226 if (peer_state.conn == C_AHEAD)
3227 ns.conn = C_BEHIND;
3228
Philipp Reisnerb411b362009-09-25 16:07:19 -07003229 if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
3230 get_ldev_if_state(mdev, D_NEGOTIATING)) {
3231 int cr; /* consider resync */
3232
3233 /* if we established a new connection */
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003234 cr = (os.conn < C_CONNECTED);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003235 /* if we had an established connection
3236 * and one of the nodes newly attaches a disk */
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003237 cr |= (os.conn == C_CONNECTED &&
Philipp Reisnerb411b362009-09-25 16:07:19 -07003238 (peer_state.disk == D_NEGOTIATING ||
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003239 os.disk == D_NEGOTIATING));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003240 /* if we have both been inconsistent, and the peer has been
3241 * forced to be UpToDate with --overwrite-data */
3242 cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
3243 /* if we had been plain connected, and the admin requested to
3244 * start a sync by "invalidate" or "invalidate-remote" */
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003245 cr |= (os.conn == C_CONNECTED &&
Philipp Reisnerb411b362009-09-25 16:07:19 -07003246 (peer_state.conn >= C_STARTING_SYNC_S &&
3247 peer_state.conn <= C_WF_BITMAP_T));
3248
3249 if (cr)
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003250 ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003251
3252 put_ldev(mdev);
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003253 if (ns.conn == C_MASK) {
3254 ns.conn = C_CONNECTED;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003255 if (mdev->state.disk == D_NEGOTIATING) {
Lars Ellenberg82f59cc2010-10-16 12:13:47 +02003256 drbd_force_state(mdev, NS(disk, D_FAILED));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003257 } else if (peer_state.disk == D_NEGOTIATING) {
3258 dev_err(DEV, "Disk attach process on the peer node was aborted.\n");
3259 peer_state.disk = D_DISKLESS;
Lars Ellenberg580b9762010-02-26 23:15:23 +01003260 real_peer_disk = D_DISKLESS;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003261 } else {
Philipp Reisnercf14c2e2010-02-02 21:03:50 +01003262 if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003263 return false;
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003264 D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003265 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003266 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003267 }
3268 }
3269 }
3270
3271 spin_lock_irq(&mdev->req_lock);
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003272 if (mdev->state.i != os.i)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003273 goto retry;
3274 clear_bit(CONSIDER_RESYNC, &mdev->flags);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003275 ns.peer = peer_state.role;
3276 ns.pdsk = real_peer_disk;
3277 ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003278 if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003279 ns.disk = mdev->new_state_tmp.disk;
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003280 cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
3281 if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
Philipp Reisner481c6f52010-06-22 14:03:27 +02003282 test_bit(NEW_CUR_UUID, &mdev->flags)) {
3283 /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
3284 for temporal network outages! */
3285 spin_unlock_irq(&mdev->req_lock);
3286 dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
3287 tl_clear(mdev);
3288 drbd_uuid_new_current(mdev);
3289 clear_bit(NEW_CUR_UUID, &mdev->flags);
3290 drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003291 return false;
Philipp Reisner481c6f52010-06-22 14:03:27 +02003292 }
Philipp Reisner65d922c2010-06-16 16:18:09 +02003293 rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003294 ns = mdev->state;
3295 spin_unlock_irq(&mdev->req_lock);
3296
3297 if (rv < SS_SUCCESS) {
3298 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003299 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003300 }
3301
Lars Ellenberg4ac4aad2010-07-22 17:39:26 +02003302 if (os.conn > C_WF_REPORT_PARAMS) {
3303 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
Philipp Reisnerb411b362009-09-25 16:07:19 -07003304 peer_state.disk != D_NEGOTIATING ) {
3305 /* we want resync, peer has not yet decided to sync... */
3306 /* Nowadays only used when forcing a node into primary role and
3307 setting its disk to UpToDate with that */
3308 drbd_send_uuids(mdev);
3309 drbd_send_state(mdev);
3310 }
3311 }
3312
3313 mdev->net_conf->want_lose = 0;
3314
3315 drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
3316
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003317 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003318}
3319
Philipp Reisner02918be2010-08-20 14:35:10 +02003320static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003321{
Philipp Reisner02918be2010-08-20 14:35:10 +02003322 struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003323
3324 wait_event(mdev->misc_wait,
3325 mdev->state.conn == C_WF_SYNC_UUID ||
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02003326 mdev->state.conn == C_BEHIND ||
Philipp Reisnerb411b362009-09-25 16:07:19 -07003327 mdev->state.conn < C_CONNECTED ||
3328 mdev->state.disk < D_NEGOTIATING);
3329
3330 /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
3331
Philipp Reisnerb411b362009-09-25 16:07:19 -07003332 /* Here the _drbd_uuid_ functions are right, current should
3333 _not_ be rotated into the history */
3334 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
3335 _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid));
3336 _drbd_uuid_set(mdev, UI_BITMAP, 0UL);
3337
Lars Ellenberg62b0da32011-01-20 13:25:21 +01003338 drbd_print_uuids(mdev, "updated sync uuid");
Philipp Reisnerb411b362009-09-25 16:07:19 -07003339 drbd_start_resync(mdev, C_SYNC_TARGET);
3340
3341 put_ldev(mdev);
3342 } else
3343 dev_err(DEV, "Ignoring SyncUUID packet!\n");
3344
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003345 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003346}
3347
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003348/**
3349 * receive_bitmap_plain
3350 *
3351 * Return 0 when done, 1 when another iteration is needed, and a negative error
3352 * code upon failure.
3353 */
3354static int
Philipp Reisner02918be2010-08-20 14:35:10 +02003355receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
3356 unsigned long *buffer, struct bm_xfer_ctx *c)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003357{
3358 unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
3359 unsigned want = num_words * sizeof(long);
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003360 int err;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003361
Philipp Reisner02918be2010-08-20 14:35:10 +02003362 if (want != data_size) {
3363 dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003364 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003365 }
3366 if (want == 0)
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003367 return 0;
3368 err = drbd_recv(mdev, buffer, want);
3369 if (err != want) {
3370 if (err >= 0)
3371 err = -EIO;
3372 return err;
3373 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003374
3375 drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
3376
3377 c->word_offset += num_words;
3378 c->bit_offset = c->word_offset * BITS_PER_LONG;
3379 if (c->bit_offset > c->bm_bits)
3380 c->bit_offset = c->bm_bits;
3381
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003382 return 1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003383}
3384
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003385/**
3386 * recv_bm_rle_bits
3387 *
3388 * Return 0 when done, 1 when another iteration is needed, and a negative error
3389 * code upon failure.
3390 */
3391static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07003392recv_bm_rle_bits(struct drbd_conf *mdev,
3393 struct p_compressed_bm *p,
3394 struct bm_xfer_ctx *c)
3395{
3396 struct bitstream bs;
3397 u64 look_ahead;
3398 u64 rl;
3399 u64 tmp;
3400 unsigned long s = c->bit_offset;
3401 unsigned long e;
Lars Ellenberg004352f2010-10-05 20:13:58 +02003402 int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003403 int toggle = DCBP_get_start(p);
3404 int have;
3405 int bits;
3406
3407 bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
3408
3409 bits = bitstream_get_bits(&bs, &look_ahead, 64);
3410 if (bits < 0)
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003411 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003412
3413 for (have = bits; have > 0; s += rl, toggle = !toggle) {
3414 bits = vli_decode_bits(&rl, look_ahead);
3415 if (bits <= 0)
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003416 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003417
3418 if (toggle) {
3419 e = s + rl -1;
3420 if (e >= c->bm_bits) {
3421 dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003422 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003423 }
3424 _drbd_bm_set_bits(mdev, s, e);
3425 }
3426
3427 if (have < bits) {
3428 dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
3429 have, bits, look_ahead,
3430 (unsigned int)(bs.cur.b - p->code),
3431 (unsigned int)bs.buf_len);
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003432 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003433 }
3434 look_ahead >>= bits;
3435 have -= bits;
3436
3437 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
3438 if (bits < 0)
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003439 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003440 look_ahead |= tmp << have;
3441 have += bits;
3442 }
3443
3444 c->bit_offset = s;
3445 bm_xfer_ctx_bit_to_word_offset(c);
3446
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003447 return (s != c->bm_bits);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003448}
3449
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003450/**
3451 * decode_bitmap_c
3452 *
3453 * Return 0 when done, 1 when another iteration is needed, and a negative error
3454 * code upon failure.
3455 */
3456static int
Philipp Reisnerb411b362009-09-25 16:07:19 -07003457decode_bitmap_c(struct drbd_conf *mdev,
3458 struct p_compressed_bm *p,
3459 struct bm_xfer_ctx *c)
3460{
3461 if (DCBP_get_code(p) == RLE_VLI_Bits)
3462 return recv_bm_rle_bits(mdev, p, c);
3463
3464 /* other variants had been implemented for evaluation,
3465 * but have been dropped as this one turned out to be "best"
3466 * during all our tests. */
3467
3468 dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
3469 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003470 return -EIO;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003471}
3472
3473void INFO_bm_xfer_stats(struct drbd_conf *mdev,
3474 const char *direction, struct bm_xfer_ctx *c)
3475{
3476 /* what would it take to transfer it "plaintext" */
Philipp Reisner0b70a132010-08-20 13:36:10 +02003477 unsigned plain = sizeof(struct p_header80) *
Philipp Reisnerb411b362009-09-25 16:07:19 -07003478 ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
3479 + c->bm_words * sizeof(long);
3480 unsigned total = c->bytes[0] + c->bytes[1];
3481 unsigned r;
3482
3483 /* total can not be zero. but just in case: */
3484 if (total == 0)
3485 return;
3486
3487 /* don't report if not compressed */
3488 if (total >= plain)
3489 return;
3490
3491 /* total < plain. check for overflow, still */
3492 r = (total > UINT_MAX/1000) ? (total / (plain/1000))
3493 : (1000 * total / plain);
3494
3495 if (r > 1000)
3496 r = 1000;
3497
3498 r = 1000 - r;
3499 dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
3500 "total %u; compression: %u.%u%%\n",
3501 direction,
3502 c->bytes[1], c->packets[1],
3503 c->bytes[0], c->packets[0],
3504 total, r/10, r % 10);
3505}
3506
3507/* Since we are processing the bitfield from lower addresses to higher,
3508 it does not matter if the process it in 32 bit chunks or 64 bit
3509 chunks as long as it is little endian. (Understand it as byte stream,
3510 beginning with the lowest byte...) If we would use big endian
3511 we would need to process it from the highest address to the lowest,
3512 in order to be agnostic to the 32 vs 64 bits issue.
3513
3514 returns 0 on failure, 1 if we successfully received it. */
Philipp Reisner02918be2010-08-20 14:35:10 +02003515static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003516{
3517 struct bm_xfer_ctx c;
3518 void *buffer;
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003519 int err;
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003520 int ok = false;
Philipp Reisner02918be2010-08-20 14:35:10 +02003521 struct p_header80 *h = &mdev->data.rbuf.header.h80;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003522
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003523 drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
3524 /* you are supposed to send additional out-of-sync information
3525 * if you actually set bits during this phase */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003526
3527 /* maybe we should use some per thread scratch page,
3528 * and allocate that during initial device creation? */
3529 buffer = (unsigned long *) __get_free_page(GFP_NOIO);
3530 if (!buffer) {
3531 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
3532 goto out;
3533 }
3534
3535 c = (struct bm_xfer_ctx) {
3536 .bm_bits = drbd_bm_bits(mdev),
3537 .bm_words = drbd_bm_words(mdev),
3538 };
3539
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003540 for(;;) {
Philipp Reisner02918be2010-08-20 14:35:10 +02003541 if (cmd == P_BITMAP) {
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003542 err = receive_bitmap_plain(mdev, data_size, buffer, &c);
Philipp Reisner02918be2010-08-20 14:35:10 +02003543 } else if (cmd == P_COMPRESSED_BITMAP) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003544 /* MAYBE: sanity check that we speak proto >= 90,
3545 * and the feature is enabled! */
3546 struct p_compressed_bm *p;
3547
Philipp Reisner02918be2010-08-20 14:35:10 +02003548 if (data_size > BM_PACKET_PAYLOAD_BYTES) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003549 dev_err(DEV, "ReportCBitmap packet too large\n");
3550 goto out;
3551 }
3552 /* use the page buff */
3553 p = buffer;
3554 memcpy(p, h, sizeof(*h));
Philipp Reisner02918be2010-08-20 14:35:10 +02003555 if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003556 goto out;
Lars Ellenberg004352f2010-10-05 20:13:58 +02003557 if (data_size <= (sizeof(*p) - sizeof(p->head))) {
3558 dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
Andreas Gruenbacher78fcbda2010-12-10 22:18:27 +01003559 goto out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003560 }
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003561 err = decode_bitmap_c(mdev, p, &c);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003562 } else {
Philipp Reisner02918be2010-08-20 14:35:10 +02003563 dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003564 goto out;
3565 }
3566
Philipp Reisner02918be2010-08-20 14:35:10 +02003567 c.packets[cmd == P_BITMAP]++;
3568 c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003569
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003570 if (err <= 0) {
3571 if (err < 0)
3572 goto out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003573 break;
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003574 }
Philipp Reisner02918be2010-08-20 14:35:10 +02003575 if (!drbd_recv_header(mdev, &cmd, &data_size))
Philipp Reisnerb411b362009-09-25 16:07:19 -07003576 goto out;
Andreas Gruenbacher2c464072010-12-11 21:53:12 +01003577 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003578
3579 INFO_bm_xfer_stats(mdev, "receive", &c);
3580
3581 if (mdev->state.conn == C_WF_BITMAP_T) {
Andreas Gruenbacherde1f8e42010-12-10 21:04:00 +01003582 enum drbd_state_rv rv;
3583
Philipp Reisnerb411b362009-09-25 16:07:19 -07003584 ok = !drbd_send_bitmap(mdev);
3585 if (!ok)
3586 goto out;
3587 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
Andreas Gruenbacherde1f8e42010-12-10 21:04:00 +01003588 rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
3589 D_ASSERT(rv == SS_SUCCESS);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003590 } else if (mdev->state.conn != C_WF_BITMAP_S) {
3591 /* admin may have requested C_DISCONNECTING,
3592 * other threads may have noticed network errors */
3593 dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
3594 drbd_conn_str(mdev->state.conn));
3595 }
3596
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003597 ok = true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003598 out:
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003599 drbd_bm_unlock(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003600 if (ok && mdev->state.conn == C_WF_BITMAP_S)
3601 drbd_start_resync(mdev, C_SYNC_SOURCE);
3602 free_page((unsigned long) buffer);
3603 return ok;
3604}
3605
Philipp Reisner02918be2010-08-20 14:35:10 +02003606static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003607{
3608 /* TODO zero copy sink :) */
3609 static char sink[128];
3610 int size, want, r;
3611
Philipp Reisner02918be2010-08-20 14:35:10 +02003612 dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
3613 cmd, data_size);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003614
Philipp Reisner02918be2010-08-20 14:35:10 +02003615 size = data_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003616 while (size > 0) {
3617 want = min_t(int, size, sizeof(sink));
3618 r = drbd_recv(mdev, sink, want);
3619 ERR_IF(r <= 0) break;
3620 size -= r;
3621 }
3622 return size == 0;
3623}
3624
Philipp Reisner02918be2010-08-20 14:35:10 +02003625static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
Philipp Reisnerb411b362009-09-25 16:07:19 -07003626{
Philipp Reisnerb411b362009-09-25 16:07:19 -07003627 /* Make sure we've acked all the TCP data associated
3628 * with the data requests being unplugged */
3629 drbd_tcp_quickack(mdev->data.socket);
3630
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003631 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003632}
3633
Philipp Reisner73a01a12010-10-27 14:33:00 +02003634static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
3635{
3636 struct p_block_desc *p = &mdev->data.rbuf.block_desc;
3637
Lars Ellenbergf735e3632010-12-17 21:06:18 +01003638 switch (mdev->state.conn) {
3639 case C_WF_SYNC_UUID:
3640 case C_WF_BITMAP_T:
3641 case C_BEHIND:
3642 break;
3643 default:
3644 dev_err(DEV, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
3645 drbd_conn_str(mdev->state.conn));
3646 }
3647
Philipp Reisner73a01a12010-10-27 14:33:00 +02003648 drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
3649
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01003650 return true;
Philipp Reisner73a01a12010-10-27 14:33:00 +02003651}
3652
Philipp Reisner02918be2010-08-20 14:35:10 +02003653typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003654
Philipp Reisner02918be2010-08-20 14:35:10 +02003655struct data_cmd {
3656 int expect_payload;
3657 size_t pkt_size;
3658 drbd_cmd_handler_f function;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003659};
3660
Philipp Reisner02918be2010-08-20 14:35:10 +02003661static struct data_cmd drbd_cmd_handler[] = {
3662 [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
3663 [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
3664 [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
3665 [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
3666 [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3667 [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
3668 [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
3669 [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3670 [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3671 [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
3672 [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
3673 [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
3674 [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
3675 [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
3676 [P_STATE] = { 0, sizeof(struct p_state), receive_state },
3677 [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
3678 [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
3679 [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
3680 [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3681 [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
3682 [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
Philipp Reisner73a01a12010-10-27 14:33:00 +02003683 [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
Philipp Reisner02918be2010-08-20 14:35:10 +02003684 /* anything missing from this table is in
3685 * the asender_tbl, see get_asender_cmd */
3686 [P_MAX_CMD] = { 0, 0, NULL },
3687};
3688
3689/* All handler functions that expect a sub-header get that sub-heder in
3690 mdev->data.rbuf.header.head.payload.
3691
3692 Usually in mdev->data.rbuf.header.head the callback can find the usual
3693 p_header, but they may not rely on that. Since there is also p_header95 !
3694 */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003695
3696static void drbdd(struct drbd_conf *mdev)
3697{
Philipp Reisner02918be2010-08-20 14:35:10 +02003698 union p_header *header = &mdev->data.rbuf.header;
3699 unsigned int packet_size;
3700 enum drbd_packets cmd;
3701 size_t shs; /* sub header size */
3702 int rv;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003703
3704 while (get_t_state(&mdev->receiver) == Running) {
3705 drbd_thread_current_set_cpu(mdev);
Philipp Reisner02918be2010-08-20 14:35:10 +02003706 if (!drbd_recv_header(mdev, &cmd, &packet_size))
3707 goto err_out;
3708
3709 if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
3710 dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
3711 goto err_out;
Lars Ellenberg0b33a912009-11-16 15:58:04 +01003712 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003713
Philipp Reisner02918be2010-08-20 14:35:10 +02003714 shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
Philipp Reisner02918be2010-08-20 14:35:10 +02003715 if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
3716 dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
3717 goto err_out;
3718 }
3719
Lars Ellenbergc13f7e12010-10-29 23:32:01 +02003720 if (shs) {
3721 rv = drbd_recv(mdev, &header->h80.payload, shs);
3722 if (unlikely(rv != shs)) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01003723 if (!signal_pending(current))
3724 dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
Lars Ellenbergc13f7e12010-10-29 23:32:01 +02003725 goto err_out;
3726 }
3727 }
3728
Philipp Reisner02918be2010-08-20 14:35:10 +02003729 rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
3730
3731 if (unlikely(!rv)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003732 dev_err(DEV, "error receiving %s, l: %d!\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02003733 cmdname(cmd), packet_size);
3734 goto err_out;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003735 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003736 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07003737
Philipp Reisner02918be2010-08-20 14:35:10 +02003738 if (0) {
3739 err_out:
3740 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
Philipp Reisnerb411b362009-09-25 16:07:19 -07003741 }
Lars Ellenberg856c50c2010-10-14 13:37:40 +02003742 /* If we leave here, we probably want to update at least the
3743 * "Connected" indicator on stable storage. Do so explicitly here. */
3744 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003745}
3746
3747void drbd_flush_workqueue(struct drbd_conf *mdev)
3748{
3749 struct drbd_wq_barrier barr;
3750
3751 barr.w.cb = w_prev_work_done;
3752 init_completion(&barr.done);
3753 drbd_queue_work(&mdev->data.work, &barr.w);
3754 wait_for_completion(&barr.done);
3755}
3756
Philipp Reisnerf70b35112010-06-24 14:34:40 +02003757void drbd_free_tl_hash(struct drbd_conf *mdev)
3758{
3759 struct hlist_head *h;
3760
3761 spin_lock_irq(&mdev->req_lock);
3762
3763 if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
3764 spin_unlock_irq(&mdev->req_lock);
3765 return;
3766 }
3767 /* paranoia code */
3768 for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
3769 if (h->first)
3770 dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
3771 (int)(h - mdev->ee_hash), h->first);
3772 kfree(mdev->ee_hash);
3773 mdev->ee_hash = NULL;
3774 mdev->ee_hash_s = 0;
3775
3776 /* paranoia code */
3777 for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
3778 if (h->first)
3779 dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
3780 (int)(h - mdev->tl_hash), h->first);
3781 kfree(mdev->tl_hash);
3782 mdev->tl_hash = NULL;
3783 mdev->tl_hash_s = 0;
3784 spin_unlock_irq(&mdev->req_lock);
3785}
3786
Philipp Reisnerb411b362009-09-25 16:07:19 -07003787static void drbd_disconnect(struct drbd_conf *mdev)
3788{
3789 enum drbd_fencing_p fp;
3790 union drbd_state os, ns;
3791 int rv = SS_UNKNOWN_ERROR;
3792 unsigned int i;
3793
3794 if (mdev->state.conn == C_STANDALONE)
3795 return;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003796
3797 /* asender does not clean up anything. it must not interfere, either */
3798 drbd_thread_stop(&mdev->asender);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003799 drbd_free_sock(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003800
Philipp Reisner85719572010-07-21 10:20:17 +02003801 /* wait for current activity to cease. */
Philipp Reisnerb411b362009-09-25 16:07:19 -07003802 spin_lock_irq(&mdev->req_lock);
3803 _drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
3804 _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
3805 _drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
3806 spin_unlock_irq(&mdev->req_lock);
3807
3808 /* We do not have data structures that would allow us to
3809 * get the rs_pending_cnt down to 0 again.
3810 * * On C_SYNC_TARGET we do not have any data structures describing
3811 * the pending RSDataRequest's we have sent.
3812 * * On C_SYNC_SOURCE there is no data structure that tracks
3813 * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
3814 * And no, it is not the sum of the reference counts in the
3815 * resync_LRU. The resync_LRU tracks the whole operation including
3816 * the disk-IO, while the rs_pending_cnt only tracks the blocks
3817 * on the fly. */
3818 drbd_rs_cancel_all(mdev);
3819 mdev->rs_total = 0;
3820 mdev->rs_failed = 0;
3821 atomic_set(&mdev->rs_pending_cnt, 0);
3822 wake_up(&mdev->misc_wait);
3823
3824 /* make sure syncer is stopped and w_resume_next_sg queued */
3825 del_timer_sync(&mdev->resync_timer);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003826 resync_timer_fn((unsigned long)mdev);
3827
Philipp Reisnerb411b362009-09-25 16:07:19 -07003828 /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
3829 * w_make_resync_request etc. which may still be on the worker queue
3830 * to be "canceled" */
3831 drbd_flush_workqueue(mdev);
3832
3833 /* This also does reclaim_net_ee(). If we do this too early, we might
3834 * miss some resync ee and pages.*/
3835 drbd_process_done_ee(mdev);
3836
3837 kfree(mdev->p_uuid);
3838 mdev->p_uuid = NULL;
3839
Philipp Reisnerfb22c402010-09-08 23:20:21 +02003840 if (!is_susp(mdev->state))
Philipp Reisnerb411b362009-09-25 16:07:19 -07003841 tl_clear(mdev);
3842
Philipp Reisnerb411b362009-09-25 16:07:19 -07003843 dev_info(DEV, "Connection closed\n");
3844
3845 drbd_md_sync(mdev);
3846
3847 fp = FP_DONT_CARE;
3848 if (get_ldev(mdev)) {
3849 fp = mdev->ldev->dc.fencing;
3850 put_ldev(mdev);
3851 }
3852
Philipp Reisner87f7be42010-06-11 13:56:33 +02003853 if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
3854 drbd_try_outdate_peer_async(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003855
3856 spin_lock_irq(&mdev->req_lock);
3857 os = mdev->state;
3858 if (os.conn >= C_UNCONNECTED) {
3859 /* Do not restart in case we are C_DISCONNECTING */
3860 ns = os;
3861 ns.conn = C_UNCONNECTED;
3862 rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
3863 }
3864 spin_unlock_irq(&mdev->req_lock);
3865
3866 if (os.conn == C_DISCONNECTING) {
Philipp Reisner84dfb9f2010-06-23 11:20:05 +02003867 wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003868
Philipp Reisnerb411b362009-09-25 16:07:19 -07003869 crypto_free_hash(mdev->cram_hmac_tfm);
3870 mdev->cram_hmac_tfm = NULL;
3871
3872 kfree(mdev->net_conf);
3873 mdev->net_conf = NULL;
3874 drbd_request_state(mdev, NS(conn, C_STANDALONE));
3875 }
3876
Lars Ellenberg20ceb2b2011-01-21 10:56:44 +01003877 /* serialize with bitmap writeout triggered by the state change,
3878 * if any. */
3879 wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
3880
Philipp Reisnerb411b362009-09-25 16:07:19 -07003881 /* tcp_close and release of sendpage pages can be deferred. I don't
3882 * want to use SO_LINGER, because apparently it can be deferred for
3883 * more than 20 seconds (longest time I checked).
3884 *
3885 * Actually we don't care for exactly when the network stack does its
3886 * put_page(), but release our reference on these pages right here.
3887 */
3888 i = drbd_release_ee(mdev, &mdev->net_ee);
3889 if (i)
3890 dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
Lars Ellenberg435f0742010-09-06 12:30:25 +02003891 i = atomic_read(&mdev->pp_in_use_by_net);
3892 if (i)
3893 dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003894 i = atomic_read(&mdev->pp_in_use);
3895 if (i)
Lars Ellenberg45bb9122010-05-14 17:10:48 +02003896 dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003897
3898 D_ASSERT(list_empty(&mdev->read_ee));
3899 D_ASSERT(list_empty(&mdev->active_ee));
3900 D_ASSERT(list_empty(&mdev->sync_ee));
3901 D_ASSERT(list_empty(&mdev->done_ee));
3902
3903 /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
3904 atomic_set(&mdev->current_epoch->epoch_size, 0);
3905 D_ASSERT(list_empty(&mdev->current_epoch->list));
3906}
3907
3908/*
3909 * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
3910 * we can agree on is stored in agreed_pro_version.
3911 *
3912 * feature flags and the reserved array should be enough room for future
3913 * enhancements of the handshake protocol, and possible plugins...
3914 *
3915 * for now, they are expected to be zero, but ignored.
3916 */
3917static int drbd_send_handshake(struct drbd_conf *mdev)
3918{
3919 /* ASSERT current == mdev->receiver ... */
3920 struct p_handshake *p = &mdev->data.sbuf.handshake;
3921 int ok;
3922
3923 if (mutex_lock_interruptible(&mdev->data.mutex)) {
3924 dev_err(DEV, "interrupted during initial handshake\n");
3925 return 0; /* interrupted. not ok. */
3926 }
3927
3928 if (mdev->data.socket == NULL) {
3929 mutex_unlock(&mdev->data.mutex);
3930 return 0;
3931 }
3932
3933 memset(p, 0, sizeof(*p));
3934 p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
3935 p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
3936 ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
Philipp Reisner0b70a132010-08-20 13:36:10 +02003937 (struct p_header80 *)p, sizeof(*p), 0 );
Philipp Reisnerb411b362009-09-25 16:07:19 -07003938 mutex_unlock(&mdev->data.mutex);
3939 return ok;
3940}
3941
3942/*
3943 * return values:
3944 * 1 yes, we have a valid connection
3945 * 0 oops, did not work out, please try again
3946 * -1 peer talks different language,
3947 * no point in trying again, please go standalone.
3948 */
3949static int drbd_do_handshake(struct drbd_conf *mdev)
3950{
3951 /* ASSERT current == mdev->receiver ... */
3952 struct p_handshake *p = &mdev->data.rbuf.handshake;
Philipp Reisner02918be2010-08-20 14:35:10 +02003953 const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
3954 unsigned int length;
3955 enum drbd_packets cmd;
Philipp Reisnerb411b362009-09-25 16:07:19 -07003956 int rv;
3957
3958 rv = drbd_send_handshake(mdev);
3959 if (!rv)
3960 return 0;
3961
Philipp Reisner02918be2010-08-20 14:35:10 +02003962 rv = drbd_recv_header(mdev, &cmd, &length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003963 if (!rv)
3964 return 0;
3965
Philipp Reisner02918be2010-08-20 14:35:10 +02003966 if (cmd != P_HAND_SHAKE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003967 dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02003968 cmdname(cmd), cmd);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003969 return -1;
3970 }
3971
Philipp Reisner02918be2010-08-20 14:35:10 +02003972 if (length != expect) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07003973 dev_err(DEV, "expected HandShake length: %u, received: %u\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02003974 expect, length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003975 return -1;
3976 }
3977
3978 rv = drbd_recv(mdev, &p->head.payload, expect);
3979
3980 if (rv != expect) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01003981 if (!signal_pending(current))
3982 dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
Philipp Reisnerb411b362009-09-25 16:07:19 -07003983 return 0;
3984 }
3985
Philipp Reisnerb411b362009-09-25 16:07:19 -07003986 p->protocol_min = be32_to_cpu(p->protocol_min);
3987 p->protocol_max = be32_to_cpu(p->protocol_max);
3988 if (p->protocol_max == 0)
3989 p->protocol_max = p->protocol_min;
3990
3991 if (PRO_VERSION_MAX < p->protocol_min ||
3992 PRO_VERSION_MIN > p->protocol_max)
3993 goto incompat;
3994
3995 mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
3996
3997 dev_info(DEV, "Handshake successful: "
3998 "Agreed network protocol version %d\n", mdev->agreed_pro_version);
3999
4000 return 1;
4001
4002 incompat:
4003 dev_err(DEV, "incompatible DRBD dialects: "
4004 "I support %d-%d, peer supports %d-%d\n",
4005 PRO_VERSION_MIN, PRO_VERSION_MAX,
4006 p->protocol_min, p->protocol_max);
4007 return -1;
4008}
4009
4010#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
4011static int drbd_do_auth(struct drbd_conf *mdev)
4012{
4013 dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
4014 dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004015 return -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004016}
4017#else
4018#define CHALLENGE_LEN 64
Johannes Thomab10d96c2010-01-07 16:02:50 +01004019
4020/* Return value:
4021 1 - auth succeeded,
4022 0 - failed, try again (network error),
4023 -1 - auth failed, don't try again.
4024*/
4025
Philipp Reisnerb411b362009-09-25 16:07:19 -07004026static int drbd_do_auth(struct drbd_conf *mdev)
4027{
4028 char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
4029 struct scatterlist sg;
4030 char *response = NULL;
4031 char *right_response = NULL;
4032 char *peers_ch = NULL;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004033 unsigned int key_len = strlen(mdev->net_conf->shared_secret);
4034 unsigned int resp_size;
4035 struct hash_desc desc;
Philipp Reisner02918be2010-08-20 14:35:10 +02004036 enum drbd_packets cmd;
4037 unsigned int length;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004038 int rv;
4039
4040 desc.tfm = mdev->cram_hmac_tfm;
4041 desc.flags = 0;
4042
4043 rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
4044 (u8 *)mdev->net_conf->shared_secret, key_len);
4045 if (rv) {
4046 dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
Johannes Thomab10d96c2010-01-07 16:02:50 +01004047 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004048 goto fail;
4049 }
4050
4051 get_random_bytes(my_challenge, CHALLENGE_LEN);
4052
4053 rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
4054 if (!rv)
4055 goto fail;
4056
Philipp Reisner02918be2010-08-20 14:35:10 +02004057 rv = drbd_recv_header(mdev, &cmd, &length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004058 if (!rv)
4059 goto fail;
4060
Philipp Reisner02918be2010-08-20 14:35:10 +02004061 if (cmd != P_AUTH_CHALLENGE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004062 dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02004063 cmdname(cmd), cmd);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004064 rv = 0;
4065 goto fail;
4066 }
4067
Philipp Reisner02918be2010-08-20 14:35:10 +02004068 if (length > CHALLENGE_LEN * 2) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004069 dev_err(DEV, "expected AuthChallenge payload too big.\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004070 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004071 goto fail;
4072 }
4073
Philipp Reisner02918be2010-08-20 14:35:10 +02004074 peers_ch = kmalloc(length, GFP_NOIO);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004075 if (peers_ch == NULL) {
4076 dev_err(DEV, "kmalloc of peers_ch failed\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004077 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004078 goto fail;
4079 }
4080
Philipp Reisner02918be2010-08-20 14:35:10 +02004081 rv = drbd_recv(mdev, peers_ch, length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004082
Philipp Reisner02918be2010-08-20 14:35:10 +02004083 if (rv != length) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01004084 if (!signal_pending(current))
4085 dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004086 rv = 0;
4087 goto fail;
4088 }
4089
4090 resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
4091 response = kmalloc(resp_size, GFP_NOIO);
4092 if (response == NULL) {
4093 dev_err(DEV, "kmalloc of response failed\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004094 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004095 goto fail;
4096 }
4097
4098 sg_init_table(&sg, 1);
Philipp Reisner02918be2010-08-20 14:35:10 +02004099 sg_set_buf(&sg, peers_ch, length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004100
4101 rv = crypto_hash_digest(&desc, &sg, sg.length, response);
4102 if (rv) {
4103 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
Johannes Thomab10d96c2010-01-07 16:02:50 +01004104 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004105 goto fail;
4106 }
4107
4108 rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
4109 if (!rv)
4110 goto fail;
4111
Philipp Reisner02918be2010-08-20 14:35:10 +02004112 rv = drbd_recv_header(mdev, &cmd, &length);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004113 if (!rv)
4114 goto fail;
4115
Philipp Reisner02918be2010-08-20 14:35:10 +02004116 if (cmd != P_AUTH_RESPONSE) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004117 dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
Philipp Reisner02918be2010-08-20 14:35:10 +02004118 cmdname(cmd), cmd);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004119 rv = 0;
4120 goto fail;
4121 }
4122
Philipp Reisner02918be2010-08-20 14:35:10 +02004123 if (length != resp_size) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004124 dev_err(DEV, "expected AuthResponse payload of wrong size\n");
4125 rv = 0;
4126 goto fail;
4127 }
4128
4129 rv = drbd_recv(mdev, response , resp_size);
4130
4131 if (rv != resp_size) {
Lars Ellenberg0ddc5542011-01-21 12:35:15 +01004132 if (!signal_pending(current))
4133 dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004134 rv = 0;
4135 goto fail;
4136 }
4137
4138 right_response = kmalloc(resp_size, GFP_NOIO);
Julia Lawall2d1ee872009-12-27 22:27:11 +01004139 if (right_response == NULL) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004140 dev_err(DEV, "kmalloc of right_response failed\n");
Johannes Thomab10d96c2010-01-07 16:02:50 +01004141 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004142 goto fail;
4143 }
4144
4145 sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
4146
4147 rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
4148 if (rv) {
4149 dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
Johannes Thomab10d96c2010-01-07 16:02:50 +01004150 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004151 goto fail;
4152 }
4153
4154 rv = !memcmp(response, right_response, resp_size);
4155
4156 if (rv)
4157 dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
4158 resp_size, mdev->net_conf->cram_hmac_alg);
Johannes Thomab10d96c2010-01-07 16:02:50 +01004159 else
4160 rv = -1;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004161
4162 fail:
4163 kfree(peers_ch);
4164 kfree(response);
4165 kfree(right_response);
4166
4167 return rv;
4168}
4169#endif
4170
4171int drbdd_init(struct drbd_thread *thi)
4172{
4173 struct drbd_conf *mdev = thi->mdev;
4174 unsigned int minor = mdev_to_minor(mdev);
4175 int h;
4176
4177 sprintf(current->comm, "drbd%d_receiver", minor);
4178
4179 dev_info(DEV, "receiver (re)started\n");
4180
4181 do {
4182 h = drbd_connect(mdev);
4183 if (h == 0) {
4184 drbd_disconnect(mdev);
Philipp Reisner20ee6392011-01-18 15:28:59 +01004185 schedule_timeout_interruptible(HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004186 }
4187 if (h == -1) {
4188 dev_warn(DEV, "Discarding network configuration.\n");
4189 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
4190 }
4191 } while (h == 0);
4192
4193 if (h > 0) {
4194 if (get_net_conf(mdev)) {
4195 drbdd(mdev);
4196 put_net_conf(mdev);
4197 }
4198 }
4199
4200 drbd_disconnect(mdev);
4201
4202 dev_info(DEV, "receiver terminated\n");
4203 return 0;
4204}
4205
4206/* ********* acknowledge sender ******** */
4207
Philipp Reisner0b70a132010-08-20 13:36:10 +02004208static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004209{
4210 struct p_req_state_reply *p = (struct p_req_state_reply *)h;
4211
4212 int retcode = be32_to_cpu(p->retcode);
4213
4214 if (retcode >= SS_SUCCESS) {
4215 set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
4216 } else {
4217 set_bit(CL_ST_CHG_FAIL, &mdev->flags);
4218 dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
4219 drbd_set_st_err_str(retcode), retcode);
4220 }
4221 wake_up(&mdev->state_wait);
4222
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004223 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004224}
4225
Philipp Reisner0b70a132010-08-20 13:36:10 +02004226static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004227{
4228 return drbd_send_ping_ack(mdev);
4229
4230}
4231
Philipp Reisner0b70a132010-08-20 13:36:10 +02004232static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004233{
4234 /* restore idle timeout */
4235 mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
Philipp Reisner309d1602010-03-02 15:03:44 +01004236 if (!test_and_set_bit(GOT_PING_ACK, &mdev->flags))
4237 wake_up(&mdev->misc_wait);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004238
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004239 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004240}
4241
Philipp Reisner0b70a132010-08-20 13:36:10 +02004242static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004243{
4244 struct p_block_ack *p = (struct p_block_ack *)h;
4245 sector_t sector = be64_to_cpu(p->sector);
4246 int blksize = be32_to_cpu(p->blksize);
4247
4248 D_ASSERT(mdev->agreed_pro_version >= 89);
4249
4250 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4251
Lars Ellenberg1d53f092010-09-05 01:13:24 +02004252 if (get_ldev(mdev)) {
4253 drbd_rs_complete_io(mdev, sector);
4254 drbd_set_in_sync(mdev, sector, blksize);
4255 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
4256 mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
4257 put_ldev(mdev);
4258 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004259 dec_rs_pending(mdev);
Philipp Reisner778f2712010-07-06 11:14:00 +02004260 atomic_add(blksize >> 9, &mdev->rs_sect_in);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004261
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004262 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004263}
4264
4265/* when we receive the ACK for a write request,
4266 * verify that we actually know about it */
4267static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
4268 u64 id, sector_t sector)
4269{
4270 struct hlist_head *slot = tl_hash_slot(mdev, sector);
4271 struct hlist_node *n;
4272 struct drbd_request *req;
4273
4274 hlist_for_each_entry(req, n, slot, colision) {
4275 if ((unsigned long)req == (unsigned long)id) {
4276 if (req->sector != sector) {
4277 dev_err(DEV, "_ack_id_to_req: found req %p but it has "
4278 "wrong sector (%llus versus %llus)\n", req,
4279 (unsigned long long)req->sector,
4280 (unsigned long long)sector);
4281 break;
4282 }
4283 return req;
4284 }
4285 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004286 return NULL;
4287}
4288
4289typedef struct drbd_request *(req_validator_fn)
4290 (struct drbd_conf *mdev, u64 id, sector_t sector);
4291
4292static int validate_req_change_req_state(struct drbd_conf *mdev,
4293 u64 id, sector_t sector, req_validator_fn validator,
4294 const char *func, enum drbd_req_event what)
4295{
4296 struct drbd_request *req;
4297 struct bio_and_error m;
4298
4299 spin_lock_irq(&mdev->req_lock);
4300 req = validator(mdev, id, sector);
4301 if (unlikely(!req)) {
4302 spin_unlock_irq(&mdev->req_lock);
Philipp Reisner2deb8332011-01-17 18:39:18 +01004303
4304 dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func,
4305 (void *)(unsigned long)id, (unsigned long long)sector);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004306 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004307 }
4308 __req_mod(req, what, &m);
4309 spin_unlock_irq(&mdev->req_lock);
4310
4311 if (m.bio)
4312 complete_master_bio(mdev, &m);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004313 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004314}
4315
Philipp Reisner0b70a132010-08-20 13:36:10 +02004316static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004317{
4318 struct p_block_ack *p = (struct p_block_ack *)h;
4319 sector_t sector = be64_to_cpu(p->sector);
4320 int blksize = be32_to_cpu(p->blksize);
4321 enum drbd_req_event what;
4322
4323 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4324
4325 if (is_syncer_block_id(p->block_id)) {
4326 drbd_set_in_sync(mdev, sector, blksize);
4327 dec_rs_pending(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004328 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004329 }
4330 switch (be16_to_cpu(h->command)) {
4331 case P_RS_WRITE_ACK:
4332 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4333 what = write_acked_by_peer_and_sis;
4334 break;
4335 case P_WRITE_ACK:
4336 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4337 what = write_acked_by_peer;
4338 break;
4339 case P_RECV_ACK:
4340 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
4341 what = recv_acked_by_peer;
4342 break;
4343 case P_DISCARD_ACK:
4344 D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
4345 what = conflict_discarded_by_peer;
4346 break;
4347 default:
4348 D_ASSERT(0);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004349 return false;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004350 }
4351
4352 return validate_req_change_req_state(mdev, p->block_id, sector,
4353 _ack_id_to_req, __func__ , what);
4354}
4355
Philipp Reisner0b70a132010-08-20 13:36:10 +02004356static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004357{
4358 struct p_block_ack *p = (struct p_block_ack *)h;
4359 sector_t sector = be64_to_cpu(p->sector);
Philipp Reisner2deb8332011-01-17 18:39:18 +01004360 int size = be32_to_cpu(p->blksize);
4361 struct drbd_request *req;
4362 struct bio_and_error m;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004363
4364 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4365
4366 if (is_syncer_block_id(p->block_id)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004367 dec_rs_pending(mdev);
4368 drbd_rs_failed_io(mdev, sector, size);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004369 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004370 }
Philipp Reisner2deb8332011-01-17 18:39:18 +01004371
4372 spin_lock_irq(&mdev->req_lock);
4373 req = _ack_id_to_req(mdev, p->block_id, sector);
4374 if (!req) {
4375 spin_unlock_irq(&mdev->req_lock);
4376 if (mdev->net_conf->wire_protocol == DRBD_PROT_A ||
4377 mdev->net_conf->wire_protocol == DRBD_PROT_B) {
4378 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
4379 The master bio might already be completed, therefore the
4380 request is no longer in the collision hash.
4381 => Do not try to validate block_id as request. */
4382 /* In Protocol B we might already have got a P_RECV_ACK
4383 but then get a P_NEG_ACK after wards. */
4384 drbd_set_out_of_sync(mdev, sector, size);
4385 return true;
4386 } else {
4387 dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__,
4388 (void *)(unsigned long)p->block_id, (unsigned long long)sector);
4389 return false;
4390 }
4391 }
4392 __req_mod(req, neg_acked, &m);
4393 spin_unlock_irq(&mdev->req_lock);
4394
4395 if (m.bio)
4396 complete_master_bio(mdev, &m);
4397 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004398}
4399
Philipp Reisner0b70a132010-08-20 13:36:10 +02004400static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004401{
4402 struct p_block_ack *p = (struct p_block_ack *)h;
4403 sector_t sector = be64_to_cpu(p->sector);
4404
4405 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4406 dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
4407 (unsigned long long)sector, be32_to_cpu(p->blksize));
4408
4409 return validate_req_change_req_state(mdev, p->block_id, sector,
4410 _ar_id_to_req, __func__ , neg_acked);
4411}
4412
Philipp Reisner0b70a132010-08-20 13:36:10 +02004413static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004414{
4415 sector_t sector;
4416 int size;
4417 struct p_block_ack *p = (struct p_block_ack *)h;
4418
4419 sector = be64_to_cpu(p->sector);
4420 size = be32_to_cpu(p->blksize);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004421
4422 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4423
4424 dec_rs_pending(mdev);
4425
4426 if (get_ldev_if_state(mdev, D_FAILED)) {
4427 drbd_rs_complete_io(mdev, sector);
Philipp Reisnerd612d302010-12-27 10:53:28 +01004428 switch (be16_to_cpu(h->command)) {
4429 case P_NEG_RS_DREPLY:
4430 drbd_rs_failed_io(mdev, sector, size);
4431 case P_RS_CANCEL:
4432 break;
4433 default:
4434 D_ASSERT(0);
4435 put_ldev(mdev);
4436 return false;
4437 }
Philipp Reisnerb411b362009-09-25 16:07:19 -07004438 put_ldev(mdev);
4439 }
4440
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004441 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004442}
4443
Philipp Reisner0b70a132010-08-20 13:36:10 +02004444static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004445{
4446 struct p_barrier_ack *p = (struct p_barrier_ack *)h;
4447
4448 tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
4449
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02004450 if (mdev->state.conn == C_AHEAD &&
4451 atomic_read(&mdev->ap_in_flight) == 0 &&
Philipp Reisner370a43e2011-01-14 16:03:11 +01004452 !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
4453 mdev->start_resync_timer.expires = jiffies + HZ;
4454 add_timer(&mdev->start_resync_timer);
Philipp Reisnerc4752ef2010-10-27 17:32:36 +02004455 }
4456
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004457 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004458}
4459
Philipp Reisner0b70a132010-08-20 13:36:10 +02004460static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisnerb411b362009-09-25 16:07:19 -07004461{
4462 struct p_block_ack *p = (struct p_block_ack *)h;
4463 struct drbd_work *w;
4464 sector_t sector;
4465 int size;
4466
4467 sector = be64_to_cpu(p->sector);
4468 size = be32_to_cpu(p->blksize);
4469
4470 update_peer_seq(mdev, be32_to_cpu(p->seq_num));
4471
4472 if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
4473 drbd_ov_oos_found(mdev, sector, size);
4474 else
4475 ov_oos_print(mdev);
4476
Lars Ellenberg1d53f092010-09-05 01:13:24 +02004477 if (!get_ldev(mdev))
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004478 return true;
Lars Ellenberg1d53f092010-09-05 01:13:24 +02004479
Philipp Reisnerb411b362009-09-25 16:07:19 -07004480 drbd_rs_complete_io(mdev, sector);
4481 dec_rs_pending(mdev);
4482
Lars Ellenbergea5442a2010-11-05 09:48:01 +01004483 --mdev->ov_left;
4484
4485 /* let's advance progress step marks only for every other megabyte */
4486 if ((mdev->ov_left & 0x200) == 0x200)
4487 drbd_advance_rs_marks(mdev, mdev->ov_left);
4488
4489 if (mdev->ov_left == 0) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07004490 w = kmalloc(sizeof(*w), GFP_NOIO);
4491 if (w) {
4492 w->cb = w_ov_finished;
4493 drbd_queue_work_front(&mdev->data.work, w);
4494 } else {
4495 dev_err(DEV, "kmalloc(w) failed.");
4496 ov_oos_print(mdev);
4497 drbd_resync_finished(mdev);
4498 }
4499 }
Lars Ellenberg1d53f092010-09-05 01:13:24 +02004500 put_ldev(mdev);
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004501 return true;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004502}
4503
Philipp Reisner02918be2010-08-20 14:35:10 +02004504static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
Philipp Reisner0ced55a2010-04-30 15:26:20 +02004505{
Andreas Gruenbacher81e84652010-12-09 15:03:57 +01004506 return true;
Philipp Reisner0ced55a2010-04-30 15:26:20 +02004507}
4508
Philipp Reisnerb411b362009-09-25 16:07:19 -07004509struct asender_cmd {
4510 size_t pkt_size;
Philipp Reisner0b70a132010-08-20 13:36:10 +02004511 int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004512};
4513
4514static struct asender_cmd *get_asender_cmd(int cmd)
4515{
4516 static struct asender_cmd asender_tbl[] = {
4517 /* anything missing from this table is in
4518 * the drbd_cmd_handler (drbd_default_handler) table,
4519 * see the beginning of drbdd() */
Philipp Reisner0b70a132010-08-20 13:36:10 +02004520 [P_PING] = { sizeof(struct p_header80), got_Ping },
4521 [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
Philipp Reisnerb411b362009-09-25 16:07:19 -07004522 [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4523 [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4524 [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4525 [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
4526 [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
4527 [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
4528 [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
4529 [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
4530 [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
4531 [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
4532 [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
Philipp Reisner02918be2010-08-20 14:35:10 +02004533 [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
Philipp Reisnerd612d302010-12-27 10:53:28 +01004534 [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply},
Philipp Reisnerb411b362009-09-25 16:07:19 -07004535 [P_MAX_CMD] = { 0, NULL },
4536 };
4537 if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
4538 return NULL;
4539 return &asender_tbl[cmd];
4540}
4541
4542int drbd_asender(struct drbd_thread *thi)
4543{
4544 struct drbd_conf *mdev = thi->mdev;
Philipp Reisner02918be2010-08-20 14:35:10 +02004545 struct p_header80 *h = &mdev->meta.rbuf.header.h80;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004546 struct asender_cmd *cmd = NULL;
4547
4548 int rv, len;
4549 void *buf = h;
4550 int received = 0;
Philipp Reisner0b70a132010-08-20 13:36:10 +02004551 int expect = sizeof(struct p_header80);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004552 int empty;
4553
4554 sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
4555
4556 current->policy = SCHED_RR; /* Make this a realtime task! */
4557 current->rt_priority = 2; /* more important than all other tasks */
4558
4559 while (get_t_state(thi) == Running) {
4560 drbd_thread_current_set_cpu(mdev);
4561 if (test_and_clear_bit(SEND_PING, &mdev->flags)) {
4562 ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
4563 mdev->meta.socket->sk->sk_rcvtimeo =
4564 mdev->net_conf->ping_timeo*HZ/10;
4565 }
4566
4567 /* conditionally cork;
4568 * it may hurt latency if we cork without much to send */
4569 if (!mdev->net_conf->no_cork &&
4570 3 < atomic_read(&mdev->unacked_cnt))
4571 drbd_tcp_cork(mdev->meta.socket);
4572 while (1) {
4573 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4574 flush_signals(current);
Lars Ellenberg0f8488e2010-10-13 18:19:23 +02004575 if (!drbd_process_done_ee(mdev))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004576 goto reconnect;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004577 /* to avoid race with newly queued ACKs */
4578 set_bit(SIGNAL_ASENDER, &mdev->flags);
4579 spin_lock_irq(&mdev->req_lock);
4580 empty = list_empty(&mdev->done_ee);
4581 spin_unlock_irq(&mdev->req_lock);
4582 /* new ack may have been queued right here,
4583 * but then there is also a signal pending,
4584 * and we start over... */
4585 if (empty)
4586 break;
4587 }
4588 /* but unconditionally uncork unless disabled */
4589 if (!mdev->net_conf->no_cork)
4590 drbd_tcp_uncork(mdev->meta.socket);
4591
4592 /* short circuit, recv_msg would return EINTR anyways. */
4593 if (signal_pending(current))
4594 continue;
4595
4596 rv = drbd_recv_short(mdev, mdev->meta.socket,
4597 buf, expect-received, 0);
4598 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4599
4600 flush_signals(current);
4601
4602 /* Note:
4603 * -EINTR (on meta) we got a signal
4604 * -EAGAIN (on meta) rcvtimeo expired
4605 * -ECONNRESET other side closed the connection
4606 * -ERESTARTSYS (on data) we got a signal
4607 * rv < 0 other than above: unexpected error!
4608 * rv == expected: full header or command
4609 * rv < expected: "woken" by signal during receive
4610 * rv == 0 : "connection shut down by peer"
4611 */
4612 if (likely(rv > 0)) {
4613 received += rv;
4614 buf += rv;
4615 } else if (rv == 0) {
4616 dev_err(DEV, "meta connection shut down by peer.\n");
4617 goto reconnect;
4618 } else if (rv == -EAGAIN) {
4619 if (mdev->meta.socket->sk->sk_rcvtimeo ==
4620 mdev->net_conf->ping_timeo*HZ/10) {
4621 dev_err(DEV, "PingAck did not arrive in time.\n");
4622 goto reconnect;
4623 }
4624 set_bit(SEND_PING, &mdev->flags);
4625 continue;
4626 } else if (rv == -EINTR) {
4627 continue;
4628 } else {
4629 dev_err(DEV, "sock_recvmsg returned %d\n", rv);
4630 goto reconnect;
4631 }
4632
4633 if (received == expect && cmd == NULL) {
4634 if (unlikely(h->magic != BE_DRBD_MAGIC)) {
Lars Ellenberg004352f2010-10-05 20:13:58 +02004635 dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
4636 be32_to_cpu(h->magic),
4637 be16_to_cpu(h->command),
4638 be16_to_cpu(h->length));
Philipp Reisnerb411b362009-09-25 16:07:19 -07004639 goto reconnect;
4640 }
4641 cmd = get_asender_cmd(be16_to_cpu(h->command));
4642 len = be16_to_cpu(h->length);
4643 if (unlikely(cmd == NULL)) {
Lars Ellenberg004352f2010-10-05 20:13:58 +02004644 dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
4645 be32_to_cpu(h->magic),
4646 be16_to_cpu(h->command),
4647 be16_to_cpu(h->length));
Philipp Reisnerb411b362009-09-25 16:07:19 -07004648 goto disconnect;
4649 }
4650 expect = cmd->pkt_size;
Philipp Reisner0b70a132010-08-20 13:36:10 +02004651 ERR_IF(len != expect-sizeof(struct p_header80))
Philipp Reisnerb411b362009-09-25 16:07:19 -07004652 goto reconnect;
Philipp Reisnerb411b362009-09-25 16:07:19 -07004653 }
4654 if (received == expect) {
4655 D_ASSERT(cmd != NULL);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004656 if (!cmd->process(mdev, h))
4657 goto reconnect;
4658
4659 buf = h;
4660 received = 0;
Philipp Reisner0b70a132010-08-20 13:36:10 +02004661 expect = sizeof(struct p_header80);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004662 cmd = NULL;
4663 }
4664 }
4665
4666 if (0) {
4667reconnect:
4668 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
Lars Ellenberg856c50c2010-10-14 13:37:40 +02004669 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004670 }
4671 if (0) {
4672disconnect:
4673 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
Lars Ellenberg856c50c2010-10-14 13:37:40 +02004674 drbd_md_sync(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07004675 }
4676 clear_bit(SIGNAL_ASENDER, &mdev->flags);
4677
4678 D_ASSERT(mdev->state.conn < C_CONNECTED);
4679 dev_info(DEV, "asender terminated\n");
4680
4681 return 0;
4682}