blob: 727ff6339754776c389f717ca98276bbdd72714e [file] [log] [blame]
Philipp Reisnerb411b362009-09-25 16:07:19 -07001/*
2 drbd_worker.c
3
4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
5
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
9
10 drbd is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2, or (at your option)
13 any later version.
14
15 drbd is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with drbd; see the file COPYING. If not, write to
22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
23
24 */
25
Philipp Reisnerb411b362009-09-25 16:07:19 -070026#include <linux/module.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070027#include <linux/drbd.h>
28#include <linux/sched.h>
29#include <linux/smp_lock.h>
30#include <linux/wait.h>
31#include <linux/mm.h>
32#include <linux/memcontrol.h>
33#include <linux/mm_inline.h>
34#include <linux/slab.h>
35#include <linux/random.h>
Philipp Reisnerb411b362009-09-25 16:07:19 -070036#include <linux/string.h>
37#include <linux/scatterlist.h>
38
39#include "drbd_int.h"
40#include "drbd_req.h"
Philipp Reisnerb411b362009-09-25 16:07:19 -070041
/* Interval, in jiffies (one tenth of a second), by which the resync timer
 * is rearmed; it also scales the per-interval request budget computed in
 * w_make_resync_request() and w_make_ov_request(). */
#define SLEEP_TIME (HZ/10)

/* Forward declaration: resync_timer_fn() installs this callback before
 * its definition appears further down in this file. */
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
45
46
47
48/* defined here:
49 drbd_md_io_complete
Lars Ellenberg45bb9122010-05-14 17:10:48 +020050 drbd_endio_sec
Philipp Reisnerb411b362009-09-25 16:07:19 -070051 drbd_endio_pri
52
53 * more endio handlers:
54 atodb_endio in drbd_actlog.c
55 drbd_bm_async_io_complete in drbd_bitmap.c
56
57 * For all these callbacks, note the following:
58 * The callbacks will be called in irq context by the IDE drivers,
59 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
60 * Try to get the locking right :)
61 *
62 */
63
64
/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the sync after dependencies, we grab a write lock, because
   we need stable states on all devices for that. */
rwlock_t global_state_lock;
70
71/* used for synchronous meta data and bitmap IO
72 * submitted by drbd_md_sync_page_io()
73 */
74void drbd_md_io_complete(struct bio *bio, int error)
75{
76 struct drbd_md_io *md_io;
77
78 md_io = (struct drbd_md_io *)bio->bi_private;
79 md_io->error = error;
80
Philipp Reisnerb411b362009-09-25 16:07:19 -070081 complete(&md_io->event);
82}
83
84/* reads on behalf of the partner,
85 * "submitted" by the receiver
86 */
Lars Ellenberg45bb9122010-05-14 17:10:48 +020087void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
Philipp Reisnerb411b362009-09-25 16:07:19 -070088{
89 unsigned long flags = 0;
Lars Ellenberg45bb9122010-05-14 17:10:48 +020090 struct drbd_conf *mdev = e->mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -070091
92 D_ASSERT(e->block_id != ID_VACANT);
93
Philipp Reisnerb411b362009-09-25 16:07:19 -070094 spin_lock_irqsave(&mdev->req_lock, flags);
95 mdev->read_cnt += e->size >> 9;
96 list_del(&e->w.list);
97 if (list_empty(&mdev->read_ee))
98 wake_up(&mdev->ee_wait);
Lars Ellenberg45bb9122010-05-14 17:10:48 +020099 if (test_bit(__EE_WAS_ERROR, &e->flags))
100 __drbd_chk_io_error(mdev, FALSE);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700101 spin_unlock_irqrestore(&mdev->req_lock, flags);
102
Philipp Reisnerb411b362009-09-25 16:07:19 -0700103 drbd_queue_work(&mdev->data.work, &e->w);
104 put_ldev(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700105}
106
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200107static int is_failed_barrier(int ee_flags)
108{
109 return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
110 == (EE_IS_BARRIER|EE_WAS_ERROR);
111}
112
Philipp Reisnerb411b362009-09-25 16:07:19 -0700113/* writes on behalf of the partner, or resync writes,
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200114 * "submitted" by the receiver, final stage. */
115static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700116{
117 unsigned long flags = 0;
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200118 struct drbd_conf *mdev = e->mdev;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700119 sector_t e_sector;
120 int do_wake;
121 int is_syncer_req;
122 int do_al_complete_io;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700123
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200124 /* if this is a failed barrier request, disable use of barriers,
125 * and schedule for resubmission */
126 if (is_failed_barrier(e->flags)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700127 drbd_bump_write_ordering(mdev, WO_bdev_flush);
128 spin_lock_irqsave(&mdev->req_lock, flags);
129 list_del(&e->w.list);
Philipp Reisnerfc8ce192010-05-20 10:04:17 +0200130 e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700131 e->w.cb = w_e_reissue;
132 /* put_ldev actually happens below, once we come here again. */
133 __release(local);
134 spin_unlock_irqrestore(&mdev->req_lock, flags);
135 drbd_queue_work(&mdev->data.work, &e->w);
136 return;
137 }
138
139 D_ASSERT(e->block_id != ID_VACANT);
140
Philipp Reisnerb411b362009-09-25 16:07:19 -0700141 /* after we moved e to done_ee,
142 * we may no longer access it,
143 * it may be freed/reused already!
144 * (as soon as we release the req_lock) */
145 e_sector = e->sector;
146 do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200147 is_syncer_req = is_syncer_block_id(e->block_id);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700148
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200149 spin_lock_irqsave(&mdev->req_lock, flags);
150 mdev->writ_cnt += e->size >> 9;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700151 list_del(&e->w.list); /* has been on active_ee or sync_ee */
152 list_add_tail(&e->w.list, &mdev->done_ee);
153
Philipp Reisnerb411b362009-09-25 16:07:19 -0700154 /* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
155 * neither did we wake possibly waiting conflicting requests.
156 * done from "drbd_process_done_ee" within the appropriate w.cb
157 * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
158
159 do_wake = is_syncer_req
160 ? list_empty(&mdev->sync_ee)
161 : list_empty(&mdev->active_ee);
162
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200163 if (test_bit(__EE_WAS_ERROR, &e->flags))
Philipp Reisnerb411b362009-09-25 16:07:19 -0700164 __drbd_chk_io_error(mdev, FALSE);
165 spin_unlock_irqrestore(&mdev->req_lock, flags);
166
167 if (is_syncer_req)
168 drbd_rs_complete_io(mdev, e_sector);
169
170 if (do_wake)
171 wake_up(&mdev->ee_wait);
172
173 if (do_al_complete_io)
174 drbd_al_complete_io(mdev, e_sector);
175
176 wake_asender(mdev);
177 put_ldev(mdev);
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200178}
Philipp Reisnerb411b362009-09-25 16:07:19 -0700179
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200180/* writes on behalf of the partner, or resync writes,
181 * "submitted" by the receiver.
182 */
183void drbd_endio_sec(struct bio *bio, int error)
184{
185 struct drbd_epoch_entry *e = bio->bi_private;
186 struct drbd_conf *mdev = e->mdev;
187 int uptodate = bio_flagged(bio, BIO_UPTODATE);
188 int is_write = bio_data_dir(bio) == WRITE;
189
190 if (error)
191 dev_warn(DEV, "%s: error=%d s=%llus\n",
192 is_write ? "write" : "read", error,
193 (unsigned long long)e->sector);
194 if (!error && !uptodate) {
195 dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
196 is_write ? "write" : "read",
197 (unsigned long long)e->sector);
198 /* strange behavior of some lower level drivers...
199 * fail the request by clearing the uptodate flag,
200 * but do not return any error?! */
201 error = -EIO;
202 }
203
204 if (error)
205 set_bit(__EE_WAS_ERROR, &e->flags);
206
207 bio_put(bio); /* no need for the bio anymore */
208 if (atomic_dec_and_test(&e->pending_bios)) {
209 if (is_write)
210 drbd_endio_write_sec_final(e);
211 else
212 drbd_endio_read_sec_final(e);
213 }
Philipp Reisnerb411b362009-09-25 16:07:19 -0700214}
215
216/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
217 */
218void drbd_endio_pri(struct bio *bio, int error)
219{
220 unsigned long flags;
221 struct drbd_request *req = bio->bi_private;
222 struct drbd_conf *mdev = req->mdev;
223 struct bio_and_error m;
224 enum drbd_req_event what;
225 int uptodate = bio_flagged(bio, BIO_UPTODATE);
226
227 if (error)
228 dev_warn(DEV, "p %s: error=%d\n",
229 bio_data_dir(bio) == WRITE ? "write" : "read", error);
230 if (!error && !uptodate) {
231 dev_warn(DEV, "p %s: setting error to -EIO\n",
232 bio_data_dir(bio) == WRITE ? "write" : "read");
233 /* strange behavior of some lower level drivers...
234 * fail the request by clearing the uptodate flag,
235 * but do not return any error?! */
236 error = -EIO;
237 }
238
Philipp Reisnerb411b362009-09-25 16:07:19 -0700239 /* to avoid recursion in __req_mod */
240 if (unlikely(error)) {
241 what = (bio_data_dir(bio) == WRITE)
242 ? write_completed_with_error
Lars Ellenberg5c3c7e62010-04-10 02:10:09 +0200243 : (bio_rw(bio) == READ)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700244 ? read_completed_with_error
245 : read_ahead_completed_with_error;
246 } else
247 what = completed_ok;
248
249 bio_put(req->private_bio);
250 req->private_bio = ERR_PTR(error);
251
252 spin_lock_irqsave(&mdev->req_lock, flags);
253 __req_mod(req, what, &m);
254 spin_unlock_irqrestore(&mdev->req_lock, flags);
255
256 if (m.bio)
257 complete_master_bio(mdev, &m);
258}
259
260int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
261{
262 struct drbd_request *req = container_of(w, struct drbd_request, w);
263
264 /* NOTE: mdev->ldev can be NULL by the time we get here! */
265 /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
266
267 /* the only way this callback is scheduled is from _req_may_be_done,
268 * when it is done and had a local write error, see comments there */
269 drbd_req_free(req);
270
271 return TRUE;
272}
273
274int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
275{
276 struct drbd_request *req = container_of(w, struct drbd_request, w);
277
278 /* We should not detach for read io-error,
279 * but try to WRITE the P_DATA_REPLY to the failed location,
280 * to give the disk the chance to relocate that block */
281
282 spin_lock_irq(&mdev->req_lock);
283 if (cancel ||
284 mdev->state.conn < C_CONNECTED ||
285 mdev->state.pdsk <= D_INCONSISTENT) {
286 _req_mod(req, send_canceled);
287 spin_unlock_irq(&mdev->req_lock);
288 dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
289 return 1;
290 }
291 spin_unlock_irq(&mdev->req_lock);
292
293 return w_send_read_req(mdev, w, 0);
294}
295
296int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
297{
298 ERR_IF(cancel) return 1;
299 dev_err(DEV, "resync inactive, but callback triggered??\n");
300 return 1; /* Simply ignore this! */
301}
302
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200303void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
304{
305 struct hash_desc desc;
306 struct scatterlist sg;
307 struct page *page = e->pages;
308 struct page *tmp;
309 unsigned len;
310
311 desc.tfm = tfm;
312 desc.flags = 0;
313
314 sg_init_table(&sg, 1);
315 crypto_hash_init(&desc);
316
317 while ((tmp = page_chain_next(page))) {
318 /* all but the last page will be fully used */
319 sg_set_page(&sg, page, PAGE_SIZE, 0);
320 crypto_hash_update(&desc, &sg, sg.length);
321 page = tmp;
322 }
323 /* and now the last, possibly only partially used page */
324 len = e->size & (PAGE_SIZE - 1);
325 sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
326 crypto_hash_update(&desc, &sg, sg.length);
327 crypto_hash_final(&desc, digest);
328}
329
330void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
Philipp Reisnerb411b362009-09-25 16:07:19 -0700331{
332 struct hash_desc desc;
333 struct scatterlist sg;
334 struct bio_vec *bvec;
335 int i;
336
337 desc.tfm = tfm;
338 desc.flags = 0;
339
340 sg_init_table(&sg, 1);
341 crypto_hash_init(&desc);
342
343 __bio_for_each_segment(bvec, bio, i, 0) {
344 sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
345 crypto_hash_update(&desc, &sg, sg.length);
346 }
347 crypto_hash_final(&desc, digest);
348}
349
350static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
351{
352 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
353 int digest_size;
354 void *digest;
355 int ok;
356
357 D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
358
359 if (unlikely(cancel)) {
360 drbd_free_ee(mdev, e);
361 return 1;
362 }
363
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200364 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700365 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
366 digest = kmalloc(digest_size, GFP_NOIO);
367 if (digest) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200368 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700369
370 inc_rs_pending(mdev);
371 ok = drbd_send_drequest_csum(mdev,
372 e->sector,
373 e->size,
374 digest,
375 digest_size,
376 P_CSUM_RS_REQUEST);
377 kfree(digest);
378 } else {
379 dev_err(DEV, "kmalloc() of digest failed.\n");
380 ok = 0;
381 }
382 } else
383 ok = 1;
384
385 drbd_free_ee(mdev, e);
386
387 if (unlikely(!ok))
388 dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
389 return ok;
390}
391
392#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
393
394static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
395{
396 struct drbd_epoch_entry *e;
397
398 if (!get_ldev(mdev))
399 return 0;
400
401 /* GFP_TRY, because if there is no memory available right now, this may
402 * be rescheduled for later. It is "only" background resync, after all. */
403 e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200404 if (!e)
405 goto fail;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700406
407 spin_lock_irq(&mdev->req_lock);
408 list_add(&e->w.list, &mdev->read_ee);
409 spin_unlock_irq(&mdev->req_lock);
410
Philipp Reisnerb411b362009-09-25 16:07:19 -0700411 e->w.cb = w_e_send_csum;
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200412 if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
413 return 1;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700414
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200415 drbd_free_ee(mdev, e);
416fail:
417 put_ldev(mdev);
418 return 2;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700419}
420
421void resync_timer_fn(unsigned long data)
422{
423 unsigned long flags;
424 struct drbd_conf *mdev = (struct drbd_conf *) data;
425 int queue;
426
427 spin_lock_irqsave(&mdev->req_lock, flags);
428
429 if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
430 queue = 1;
431 if (mdev->state.conn == C_VERIFY_S)
432 mdev->resync_work.cb = w_make_ov_request;
433 else
434 mdev->resync_work.cb = w_make_resync_request;
435 } else {
436 queue = 0;
437 mdev->resync_work.cb = w_resync_inactive;
438 }
439
440 spin_unlock_irqrestore(&mdev->req_lock, flags);
441
442 /* harmless race: list_empty outside data.work.q_lock */
443 if (list_empty(&mdev->resync_work.list) && queue)
444 drbd_queue_work(&mdev->data.work, &mdev->resync_work);
445}
446
Philipp Reisnercdd67a72010-05-04 16:57:18 +0200447static int calc_resync_rate(struct drbd_conf *mdev)
448{
449 int d = mdev->data_delay / 1000; /* us -> ms */
450 int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */
451 int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */
452 int cr = mdev->sync_conf.rate;
453
454 return d <= td ? cr :
455 d >= hd ? 0 :
456 cr + (cr * (td - d) / (hd - td));
457}
458
Philipp Reisnerb411b362009-09-25 16:07:19 -0700459int w_make_resync_request(struct drbd_conf *mdev,
460 struct drbd_work *w, int cancel)
461{
462 unsigned long bit;
463 sector_t sector;
464 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
Lars Ellenbergbb3d0002010-05-14 19:08:55 +0200465 int max_segment_size;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700466 int number, i, size, pe, mx;
467 int align, queued, sndbuf;
468
469 if (unlikely(cancel))
470 return 1;
471
472 if (unlikely(mdev->state.conn < C_CONNECTED)) {
473 dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
474 return 0;
475 }
476
477 if (mdev->state.conn != C_SYNC_TARGET)
478 dev_err(DEV, "%s in w_make_resync_request\n",
479 drbd_conn_str(mdev->state.conn));
480
481 if (!get_ldev(mdev)) {
482 /* Since we only need to access mdev->rsync a
483 get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
484 to continue resync with a broken disk makes no sense at
485 all */
486 dev_err(DEV, "Disk broke down during resync!\n");
487 mdev->resync_work.cb = w_resync_inactive;
488 return 1;
489 }
490
Lars Ellenbergbb3d0002010-05-14 19:08:55 +0200491 /* starting with drbd 8.3.8, we can handle multi-bio EEs,
492 * if it should be necessary */
493 max_segment_size = mdev->agreed_pro_version < 94 ?
494 queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
495
Philipp Reisnercdd67a72010-05-04 16:57:18 +0200496 mdev->c_sync_rate = calc_resync_rate(mdev);
497 number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700498 pe = atomic_read(&mdev->rs_pending_cnt);
499
500 mutex_lock(&mdev->data.mutex);
501 if (mdev->data.socket)
502 mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
503 else
504 mx = 1;
505 mutex_unlock(&mdev->data.mutex);
506
507 /* For resync rates >160MB/sec, allow more pending RS requests */
508 if (number > mx)
509 mx = number;
510
511 /* Limit the number of pending RS requests to no more than the peer's receive buffer */
512 if ((pe + number) > mx) {
513 number = mx - pe;
514 }
515
516 for (i = 0; i < number; i++) {
517 /* Stop generating RS requests, when half of the send buffer is filled */
518 mutex_lock(&mdev->data.mutex);
519 if (mdev->data.socket) {
520 queued = mdev->data.socket->sk->sk_wmem_queued;
521 sndbuf = mdev->data.socket->sk->sk_sndbuf;
522 } else {
523 queued = 1;
524 sndbuf = 0;
525 }
526 mutex_unlock(&mdev->data.mutex);
527 if (queued > sndbuf / 2)
528 goto requeue;
529
530next_sector:
531 size = BM_BLOCK_SIZE;
532 bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
533
534 if (bit == -1UL) {
535 mdev->bm_resync_fo = drbd_bm_bits(mdev);
536 mdev->resync_work.cb = w_resync_inactive;
537 put_ldev(mdev);
538 return 1;
539 }
540
541 sector = BM_BIT_TO_SECT(bit);
542
543 if (drbd_try_rs_begin_io(mdev, sector)) {
544 mdev->bm_resync_fo = bit;
545 goto requeue;
546 }
547 mdev->bm_resync_fo = bit + 1;
548
549 if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
550 drbd_rs_complete_io(mdev, sector);
551 goto next_sector;
552 }
553
554#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
555 /* try to find some adjacent bits.
556 * we stop if we have already the maximum req size.
557 *
558 * Additionally always align bigger requests, in order to
559 * be prepared for all stripe sizes of software RAIDs.
Philipp Reisnerb411b362009-09-25 16:07:19 -0700560 */
561 align = 1;
562 for (;;) {
563 if (size + BM_BLOCK_SIZE > max_segment_size)
564 break;
565
566 /* Be always aligned */
567 if (sector & ((1<<(align+3))-1))
568 break;
569
570 /* do not cross extent boundaries */
571 if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
572 break;
573 /* now, is it actually dirty, after all?
574 * caution, drbd_bm_test_bit is tri-state for some
575 * obscure reason; ( b == 0 ) would get the out-of-band
576 * only accidentally right because of the "oddly sized"
577 * adjustment below */
578 if (drbd_bm_test_bit(mdev, bit+1) != 1)
579 break;
580 bit++;
581 size += BM_BLOCK_SIZE;
582 if ((BM_BLOCK_SIZE << align) <= size)
583 align++;
584 i++;
585 }
586 /* if we merged some,
587 * reset the offset to start the next drbd_bm_find_next from */
588 if (size > BM_BLOCK_SIZE)
589 mdev->bm_resync_fo = bit + 1;
590#endif
591
592 /* adjust very last sectors, in case we are oddly sized */
593 if (sector + (size>>9) > capacity)
594 size = (capacity-sector)<<9;
595 if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
596 switch (read_for_csum(mdev, sector, size)) {
597 case 0: /* Disk failure*/
598 put_ldev(mdev);
599 return 0;
600 case 2: /* Allocation failed */
601 drbd_rs_complete_io(mdev, sector);
602 mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
603 goto requeue;
604 /* case 1: everything ok */
605 }
606 } else {
607 inc_rs_pending(mdev);
608 if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
609 sector, size, ID_SYNCER)) {
610 dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
611 dec_rs_pending(mdev);
612 put_ldev(mdev);
613 return 0;
614 }
615 }
616 }
617
618 if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
619 /* last syncer _request_ was sent,
620 * but the P_RS_DATA_REPLY not yet received. sync will end (and
621 * next sync group will resume), as soon as we receive the last
622 * resync data block, and the last bit is cleared.
623 * until then resync "work" is "inactive" ...
624 */
625 mdev->resync_work.cb = w_resync_inactive;
626 put_ldev(mdev);
627 return 1;
628 }
629
630 requeue:
631 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
632 put_ldev(mdev);
633 return 1;
634}
635
636static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
637{
638 int number, i, size;
639 sector_t sector;
640 const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
641
642 if (unlikely(cancel))
643 return 1;
644
645 if (unlikely(mdev->state.conn < C_CONNECTED)) {
646 dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
647 return 0;
648 }
649
650 number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
651 if (atomic_read(&mdev->rs_pending_cnt) > number)
652 goto requeue;
653
654 number -= atomic_read(&mdev->rs_pending_cnt);
655
656 sector = mdev->ov_position;
657 for (i = 0; i < number; i++) {
658 if (sector >= capacity) {
659 mdev->resync_work.cb = w_resync_inactive;
660 return 1;
661 }
662
663 size = BM_BLOCK_SIZE;
664
665 if (drbd_try_rs_begin_io(mdev, sector)) {
666 mdev->ov_position = sector;
667 goto requeue;
668 }
669
670 if (sector + (size>>9) > capacity)
671 size = (capacity-sector)<<9;
672
673 inc_rs_pending(mdev);
674 if (!drbd_send_ov_request(mdev, sector, size)) {
675 dec_rs_pending(mdev);
676 return 0;
677 }
678 sector += BM_SECT_PER_BIT;
679 }
680 mdev->ov_position = sector;
681
682 requeue:
683 mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
684 return 1;
685}
686
687
/* Worker callback run when online verify is done: frees the work item
 * @w, calls ov_oos_print() and then drbd_resync_finished().
 * Always returns 1. */
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	/* the work item is freed right here */
	kfree(w);

	ov_oos_print(mdev);
	drbd_resync_finished(mdev);

	return 1;
}
696
/* Worker callback queued by drbd_resync_finished() to retry itself:
 * frees the kmalloc()ed work item @w and calls drbd_resync_finished()
 * again.  Always returns 1. */
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
	kfree(w);
	drbd_resync_finished(mdev);

	return 1;
}
705
706int drbd_resync_finished(struct drbd_conf *mdev)
707{
708 unsigned long db, dt, dbdt;
709 unsigned long n_oos;
710 union drbd_state os, ns;
711 struct drbd_work *w;
712 char *khelper_cmd = NULL;
713
714 /* Remove all elements from the resync LRU. Since future actions
715 * might set bits in the (main) bitmap, then the entries in the
716 * resync LRU would be wrong. */
717 if (drbd_rs_del_all(mdev)) {
718 /* In case this is not possible now, most probably because
719 * there are P_RS_DATA_REPLY Packets lingering on the worker's
720 * queue (or even the read operations for those packets
721 * is not finished by now). Retry in 100ms. */
722
723 drbd_kick_lo(mdev);
724 __set_current_state(TASK_INTERRUPTIBLE);
725 schedule_timeout(HZ / 10);
726 w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
727 if (w) {
728 w->cb = w_resync_finished;
729 drbd_queue_work(&mdev->data.work, w);
730 return 1;
731 }
732 dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
733 }
734
735 dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
736 if (dt <= 0)
737 dt = 1;
738 db = mdev->rs_total;
739 dbdt = Bit2KB(db/dt);
740 mdev->rs_paused /= HZ;
741
742 if (!get_ldev(mdev))
743 goto out;
744
745 spin_lock_irq(&mdev->req_lock);
746 os = mdev->state;
747
748 /* This protects us against multiple calls (that can happen in the presence
749 of application IO), and against connectivity loss just before we arrive here. */
750 if (os.conn <= C_CONNECTED)
751 goto out_unlock;
752
753 ns = os;
754 ns.conn = C_CONNECTED;
755
756 dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
757 (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
758 "Online verify " : "Resync",
759 dt + mdev->rs_paused, mdev->rs_paused, dbdt);
760
761 n_oos = drbd_bm_total_weight(mdev);
762
763 if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
764 if (n_oos) {
765 dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
766 n_oos, Bit2KB(1));
767 khelper_cmd = "out-of-sync";
768 }
769 } else {
770 D_ASSERT((n_oos - mdev->rs_failed) == 0);
771
772 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
773 khelper_cmd = "after-resync-target";
774
775 if (mdev->csums_tfm && mdev->rs_total) {
776 const unsigned long s = mdev->rs_same_csum;
777 const unsigned long t = mdev->rs_total;
778 const int ratio =
779 (t == 0) ? 0 :
780 (t < 100000) ? ((s*100)/t) : (s/(t/100));
781 dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
782 "transferred %luK total %luK\n",
783 ratio,
784 Bit2KB(mdev->rs_same_csum),
785 Bit2KB(mdev->rs_total - mdev->rs_same_csum),
786 Bit2KB(mdev->rs_total));
787 }
788 }
789
790 if (mdev->rs_failed) {
791 dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
792
793 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
794 ns.disk = D_INCONSISTENT;
795 ns.pdsk = D_UP_TO_DATE;
796 } else {
797 ns.disk = D_UP_TO_DATE;
798 ns.pdsk = D_INCONSISTENT;
799 }
800 } else {
801 ns.disk = D_UP_TO_DATE;
802 ns.pdsk = D_UP_TO_DATE;
803
804 if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
805 if (mdev->p_uuid) {
806 int i;
807 for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
808 _drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
809 drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
810 _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
811 } else {
812 dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
813 }
814 }
815
816 drbd_uuid_set_bm(mdev, 0UL);
817
818 if (mdev->p_uuid) {
819 /* Now the two UUID sets are equal, update what we
820 * know of the peer. */
821 int i;
822 for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
823 mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
824 }
825 }
826
827 _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
828out_unlock:
829 spin_unlock_irq(&mdev->req_lock);
830 put_ldev(mdev);
831out:
832 mdev->rs_total = 0;
833 mdev->rs_failed = 0;
834 mdev->rs_paused = 0;
835 mdev->ov_start_sector = 0;
836
837 if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
838 dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
839 drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
840 }
841
842 if (khelper_cmd)
843 drbd_khelper(mdev, khelper_cmd);
844
845 return 1;
846}
847
848/* helper */
849static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
850{
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200851 if (drbd_ee_has_active_page(e)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700852 /* This might happen if sendpage() has not finished */
853 spin_lock_irq(&mdev->req_lock);
854 list_add_tail(&e->w.list, &mdev->net_ee);
855 spin_unlock_irq(&mdev->req_lock);
856 } else
857 drbd_free_ee(mdev, e);
858}
859
860/**
861 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
862 * @mdev: DRBD device.
863 * @w: work object.
864 * @cancel: The connection will be closed anyways
865 */
866int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
867{
868 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
869 int ok;
870
871 if (unlikely(cancel)) {
872 drbd_free_ee(mdev, e);
873 dec_unacked(mdev);
874 return 1;
875 }
876
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200877 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700878 ok = drbd_send_block(mdev, P_DATA_REPLY, e);
879 } else {
880 if (__ratelimit(&drbd_ratelimit_state))
881 dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
882 (unsigned long long)e->sector);
883
884 ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
885 }
886
887 dec_unacked(mdev);
888
889 move_to_net_ee_or_free(mdev, e);
890
891 if (unlikely(!ok))
892 dev_err(DEV, "drbd_send_block() failed\n");
893 return ok;
894}
895
896/**
897 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
898 * @mdev: DRBD device.
899 * @w: work object.
900 * @cancel: The connection will be closed anyways
901 */
902int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
903{
904 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
905 int ok;
906
907 if (unlikely(cancel)) {
908 drbd_free_ee(mdev, e);
909 dec_unacked(mdev);
910 return 1;
911 }
912
913 if (get_ldev_if_state(mdev, D_FAILED)) {
914 drbd_rs_complete_io(mdev, e->sector);
915 put_ldev(mdev);
916 }
917
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200918 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700919 if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
920 inc_rs_pending(mdev);
921 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
922 } else {
923 if (__ratelimit(&drbd_ratelimit_state))
924 dev_err(DEV, "Not sending RSDataReply, "
925 "partner DISKLESS!\n");
926 ok = 1;
927 }
928 } else {
929 if (__ratelimit(&drbd_ratelimit_state))
930 dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
931 (unsigned long long)e->sector);
932
933 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
934
935 /* update resync data with failure */
936 drbd_rs_failed_io(mdev, e->sector, e->size);
937 }
938
939 dec_unacked(mdev);
940
941 move_to_net_ee_or_free(mdev, e);
942
943 if (unlikely(!ok))
944 dev_err(DEV, "drbd_send_block() failed\n");
945 return ok;
946}
947
948int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
949{
950 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
951 struct digest_info *di;
952 int digest_size;
953 void *digest = NULL;
954 int ok, eq = 0;
955
956 if (unlikely(cancel)) {
957 drbd_free_ee(mdev, e);
958 dec_unacked(mdev);
959 return 1;
960 }
961
962 drbd_rs_complete_io(mdev, e->sector);
963
964 di = (struct digest_info *)(unsigned long)e->block_id;
965
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200966 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -0700967 /* quick hack to try to avoid a race against reconfiguration.
968 * a real fix would be much more involved,
969 * introducing more locking mechanisms */
970 if (mdev->csums_tfm) {
971 digest_size = crypto_hash_digestsize(mdev->csums_tfm);
972 D_ASSERT(digest_size == di->digest_size);
973 digest = kmalloc(digest_size, GFP_NOIO);
974 }
975 if (digest) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +0200976 drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
Philipp Reisnerb411b362009-09-25 16:07:19 -0700977 eq = !memcmp(digest, di->digest, digest_size);
978 kfree(digest);
979 }
980
981 if (eq) {
982 drbd_set_in_sync(mdev, e->sector, e->size);
Lars Ellenberg676396d2010-03-03 02:08:22 +0100983 /* rs_same_csums unit is BM_BLOCK_SIZE */
984 mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
Philipp Reisnerb411b362009-09-25 16:07:19 -0700985 ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
986 } else {
987 inc_rs_pending(mdev);
988 e->block_id = ID_SYNCER;
989 ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
990 }
991 } else {
992 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
993 if (__ratelimit(&drbd_ratelimit_state))
994 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
995 }
996
997 dec_unacked(mdev);
998
999 kfree(di);
1000
1001 move_to_net_ee_or_free(mdev, e);
1002
1003 if (unlikely(!ok))
1004 dev_err(DEV, "drbd_send_block/ack() failed\n");
1005 return ok;
1006}
1007
1008int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1009{
1010 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1011 int digest_size;
1012 void *digest;
1013 int ok = 1;
1014
1015 if (unlikely(cancel))
1016 goto out;
1017
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001018 if (unlikely((e->flags & EE_WAS_ERROR) != 0))
Philipp Reisnerb411b362009-09-25 16:07:19 -07001019 goto out;
1020
1021 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1022 /* FIXME if this allocation fails, online verify will not terminate! */
1023 digest = kmalloc(digest_size, GFP_NOIO);
1024 if (digest) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001025 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001026 inc_rs_pending(mdev);
1027 ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
1028 digest, digest_size, P_OV_REPLY);
1029 if (!ok)
1030 dec_rs_pending(mdev);
1031 kfree(digest);
1032 }
1033
1034out:
1035 drbd_free_ee(mdev, e);
1036
1037 dec_unacked(mdev);
1038
1039 return ok;
1040}
1041
1042void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
1043{
1044 if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
1045 mdev->ov_last_oos_size += size>>9;
1046 } else {
1047 mdev->ov_last_oos_start = sector;
1048 mdev->ov_last_oos_size = size>>9;
1049 }
1050 drbd_set_out_of_sync(mdev, sector, size);
1051 set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
1052}
1053
1054int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1055{
1056 struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
1057 struct digest_info *di;
1058 int digest_size;
1059 void *digest;
1060 int ok, eq = 0;
1061
1062 if (unlikely(cancel)) {
1063 drbd_free_ee(mdev, e);
1064 dec_unacked(mdev);
1065 return 1;
1066 }
1067
1068 /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
1069 * the resync lru has been cleaned up already */
1070 drbd_rs_complete_io(mdev, e->sector);
1071
1072 di = (struct digest_info *)(unsigned long)e->block_id;
1073
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001074 if (likely((e->flags & EE_WAS_ERROR) == 0)) {
Philipp Reisnerb411b362009-09-25 16:07:19 -07001075 digest_size = crypto_hash_digestsize(mdev->verify_tfm);
1076 digest = kmalloc(digest_size, GFP_NOIO);
1077 if (digest) {
Lars Ellenberg45bb9122010-05-14 17:10:48 +02001078 drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001079
1080 D_ASSERT(digest_size == di->digest_size);
1081 eq = !memcmp(digest, di->digest, digest_size);
1082 kfree(digest);
1083 }
1084 } else {
1085 ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
1086 if (__ratelimit(&drbd_ratelimit_state))
1087 dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
1088 }
1089
1090 dec_unacked(mdev);
1091
1092 kfree(di);
1093
1094 if (!eq)
1095 drbd_ov_oos_found(mdev, e->sector, e->size);
1096 else
1097 ov_oos_print(mdev);
1098
1099 ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
1100 eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
1101
1102 drbd_free_ee(mdev, e);
1103
1104 if (--mdev->ov_left == 0) {
1105 ov_oos_print(mdev);
1106 drbd_resync_finished(mdev);
1107 }
1108
1109 return ok;
1110}
1111
1112int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1113{
1114 struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
1115 complete(&b->done);
1116 return 1;
1117}
1118
1119int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1120{
1121 struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
1122 struct p_barrier *p = &mdev->data.sbuf.barrier;
1123 int ok = 1;
1124
1125 /* really avoid racing with tl_clear. w.cb may have been referenced
1126 * just before it was reassigned and re-queued, so double check that.
1127 * actually, this race was harmless, since we only try to send the
1128 * barrier packet here, and otherwise do nothing with the object.
1129 * but compare with the head of w_clear_epoch */
1130 spin_lock_irq(&mdev->req_lock);
1131 if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
1132 cancel = 1;
1133 spin_unlock_irq(&mdev->req_lock);
1134 if (cancel)
1135 return 1;
1136
1137 if (!drbd_get_data_sock(mdev))
1138 return 0;
1139 p->barrier = b->br_number;
1140 /* inc_ap_pending was done where this was queued.
1141 * dec_ap_pending will be done in got_BarrierAck
1142 * or (on connection loss) in w_clear_epoch. */
1143 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
1144 (struct p_header *)p, sizeof(*p), 0);
1145 drbd_put_data_sock(mdev);
1146
1147 return ok;
1148}
1149
1150int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1151{
1152 if (cancel)
1153 return 1;
1154 return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
1155}
1156
1157/**
1158 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
1159 * @mdev: DRBD device.
1160 * @w: work object.
1161 * @cancel: The connection will be closed anyways
1162 */
1163int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1164{
1165 struct drbd_request *req = container_of(w, struct drbd_request, w);
1166 int ok;
1167
1168 if (unlikely(cancel)) {
1169 req_mod(req, send_canceled);
1170 return 1;
1171 }
1172
1173 ok = drbd_send_dblock(mdev, req);
1174 req_mod(req, ok ? handed_over_to_network : send_failed);
1175
1176 return ok;
1177}
1178
1179/**
1180 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
1181 * @mdev: DRBD device.
1182 * @w: work object.
1183 * @cancel: The connection will be closed anyways
1184 */
1185int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
1186{
1187 struct drbd_request *req = container_of(w, struct drbd_request, w);
1188 int ok;
1189
1190 if (unlikely(cancel)) {
1191 req_mod(req, send_canceled);
1192 return 1;
1193 }
1194
1195 ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
1196 (unsigned long)req);
1197
1198 if (!ok) {
1199 /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
1200 * so this is probably redundant */
1201 if (mdev->state.conn >= C_CONNECTED)
1202 drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
1203 }
1204 req_mod(req, ok ? handed_over_to_network : send_failed);
1205
1206 return ok;
1207}
1208
1209static int _drbd_may_sync_now(struct drbd_conf *mdev)
1210{
1211 struct drbd_conf *odev = mdev;
1212
1213 while (1) {
1214 if (odev->sync_conf.after == -1)
1215 return 1;
1216 odev = minor_to_mdev(odev->sync_conf.after);
1217 ERR_IF(!odev) return 1;
1218 if ((odev->state.conn >= C_SYNC_SOURCE &&
1219 odev->state.conn <= C_PAUSED_SYNC_T) ||
1220 odev->state.aftr_isp || odev->state.peer_isp ||
1221 odev->state.user_isp)
1222 return 0;
1223 }
1224}
1225
1226/**
1227 * _drbd_pause_after() - Pause resync on all devices that may not resync now
1228 * @mdev: DRBD device.
1229 *
1230 * Called from process context only (admin command and after_state_ch).
1231 */
1232static int _drbd_pause_after(struct drbd_conf *mdev)
1233{
1234 struct drbd_conf *odev;
1235 int i, rv = 0;
1236
1237 for (i = 0; i < minor_count; i++) {
1238 odev = minor_to_mdev(i);
1239 if (!odev)
1240 continue;
1241 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1242 continue;
1243 if (!_drbd_may_sync_now(odev))
1244 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
1245 != SS_NOTHING_TO_DO);
1246 }
1247
1248 return rv;
1249}
1250
1251/**
1252 * _drbd_resume_next() - Resume resync on all devices that may resync now
1253 * @mdev: DRBD device.
1254 *
1255 * Called from process context only (admin command and worker).
1256 */
1257static int _drbd_resume_next(struct drbd_conf *mdev)
1258{
1259 struct drbd_conf *odev;
1260 int i, rv = 0;
1261
1262 for (i = 0; i < minor_count; i++) {
1263 odev = minor_to_mdev(i);
1264 if (!odev)
1265 continue;
1266 if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
1267 continue;
1268 if (odev->state.aftr_isp) {
1269 if (_drbd_may_sync_now(odev))
1270 rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
1271 CS_HARD, NULL)
1272 != SS_NOTHING_TO_DO) ;
1273 }
1274 }
1275 return rv;
1276}
1277
1278void resume_next_sg(struct drbd_conf *mdev)
1279{
1280 write_lock_irq(&global_state_lock);
1281 _drbd_resume_next(mdev);
1282 write_unlock_irq(&global_state_lock);
1283}
1284
1285void suspend_other_sg(struct drbd_conf *mdev)
1286{
1287 write_lock_irq(&global_state_lock);
1288 _drbd_pause_after(mdev);
1289 write_unlock_irq(&global_state_lock);
1290}
1291
1292static int sync_after_error(struct drbd_conf *mdev, int o_minor)
1293{
1294 struct drbd_conf *odev;
1295
1296 if (o_minor == -1)
1297 return NO_ERROR;
1298 if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
1299 return ERR_SYNC_AFTER;
1300
1301 /* check for loops */
1302 odev = minor_to_mdev(o_minor);
1303 while (1) {
1304 if (odev == mdev)
1305 return ERR_SYNC_AFTER_CYCLE;
1306
1307 /* dependency chain ends here, no cycles. */
1308 if (odev->sync_conf.after == -1)
1309 return NO_ERROR;
1310
1311 /* follow the dependency chain */
1312 odev = minor_to_mdev(odev->sync_conf.after);
1313 }
1314}
1315
1316int drbd_alter_sa(struct drbd_conf *mdev, int na)
1317{
1318 int changes;
1319 int retcode;
1320
1321 write_lock_irq(&global_state_lock);
1322 retcode = sync_after_error(mdev, na);
1323 if (retcode == NO_ERROR) {
1324 mdev->sync_conf.after = na;
1325 do {
1326 changes = _drbd_pause_after(mdev);
1327 changes |= _drbd_resume_next(mdev);
1328 } while (changes);
1329 }
1330 write_unlock_irq(&global_state_lock);
1331 return retcode;
1332}
1333
Philipp Reisner309d1602010-03-02 15:03:44 +01001334static void ping_peer(struct drbd_conf *mdev)
1335{
1336 clear_bit(GOT_PING_ACK, &mdev->flags);
1337 request_ping(mdev);
1338 wait_event(mdev->misc_wait,
1339 test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
1340}
1341
Philipp Reisnerb411b362009-09-25 16:07:19 -07001342/**
1343 * drbd_start_resync() - Start the resync process
1344 * @mdev: DRBD device.
1345 * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
1346 *
1347 * This function might bring you directly into one of the
1348 * C_PAUSED_SYNC_* states.
1349 */
1350void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
1351{
1352 union drbd_state ns;
1353 int r;
1354
1355 if (mdev->state.conn >= C_SYNC_SOURCE) {
1356 dev_err(DEV, "Resync already running!\n");
1357 return;
1358 }
1359
Philipp Reisnerb411b362009-09-25 16:07:19 -07001360 /* In case a previous resync run was aborted by an IO error/detach on the peer. */
1361 drbd_rs_cancel_all(mdev);
1362
1363 if (side == C_SYNC_TARGET) {
1364 /* Since application IO was locked out during C_WF_BITMAP_T and
1365 C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
1366 we check that we might make the data inconsistent. */
1367 r = drbd_khelper(mdev, "before-resync-target");
1368 r = (r >> 8) & 0xff;
1369 if (r > 0) {
1370 dev_info(DEV, "before-resync-target handler returned %d, "
1371 "dropping connection.\n", r);
1372 drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
1373 return;
1374 }
1375 }
1376
1377 drbd_state_lock(mdev);
1378
1379 if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
1380 drbd_state_unlock(mdev);
1381 return;
1382 }
1383
1384 if (side == C_SYNC_TARGET) {
1385 mdev->bm_resync_fo = 0;
1386 } else /* side == C_SYNC_SOURCE */ {
1387 u64 uuid;
1388
1389 get_random_bytes(&uuid, sizeof(u64));
1390 drbd_uuid_set(mdev, UI_BITMAP, uuid);
1391 drbd_send_sync_uuid(mdev, uuid);
1392
1393 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
1394 }
1395
1396 write_lock_irq(&global_state_lock);
1397 ns = mdev->state;
1398
1399 ns.aftr_isp = !_drbd_may_sync_now(mdev);
1400
1401 ns.conn = side;
1402
1403 if (side == C_SYNC_TARGET)
1404 ns.disk = D_INCONSISTENT;
1405 else /* side == C_SYNC_SOURCE */
1406 ns.pdsk = D_INCONSISTENT;
1407
1408 r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1409 ns = mdev->state;
1410
1411 if (ns.conn < C_CONNECTED)
1412 r = SS_UNKNOWN_ERROR;
1413
1414 if (r == SS_SUCCESS) {
1415 mdev->rs_total =
1416 mdev->rs_mark_left = drbd_bm_total_weight(mdev);
1417 mdev->rs_failed = 0;
1418 mdev->rs_paused = 0;
1419 mdev->rs_start =
1420 mdev->rs_mark_time = jiffies;
1421 mdev->rs_same_csum = 0;
1422 _drbd_pause_after(mdev);
1423 }
1424 write_unlock_irq(&global_state_lock);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001425 put_ldev(mdev);
1426
1427 if (r == SS_SUCCESS) {
1428 dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
1429 drbd_conn_str(ns.conn),
1430 (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
1431 (unsigned long) mdev->rs_total);
1432
1433 if (mdev->rs_total == 0) {
1434 /* Peer still reachable? Beware of failing before-resync-target handlers! */
Philipp Reisner309d1602010-03-02 15:03:44 +01001435 ping_peer(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001436 drbd_resync_finished(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001437 }
1438
1439 /* ns.conn may already be != mdev->state.conn,
1440 * we may have been paused in between, or become paused until
1441 * the timer triggers.
1442 * No matter, that is handled in resync_timer_fn() */
1443 if (ns.conn == C_SYNC_TARGET)
1444 mod_timer(&mdev->resync_timer, jiffies);
1445
1446 drbd_md_sync(mdev);
1447 }
Philipp Reisnerd0c3f602010-03-02 15:06:45 +01001448 drbd_state_unlock(mdev);
Philipp Reisnerb411b362009-09-25 16:07:19 -07001449}
1450
1451int drbd_worker(struct drbd_thread *thi)
1452{
1453 struct drbd_conf *mdev = thi->mdev;
1454 struct drbd_work *w = NULL;
1455 LIST_HEAD(work_list);
1456 int intr = 0, i;
1457
1458 sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
1459
1460 while (get_t_state(thi) == Running) {
1461 drbd_thread_current_set_cpu(mdev);
1462
1463 if (down_trylock(&mdev->data.work.s)) {
1464 mutex_lock(&mdev->data.mutex);
1465 if (mdev->data.socket && !mdev->net_conf->no_cork)
1466 drbd_tcp_uncork(mdev->data.socket);
1467 mutex_unlock(&mdev->data.mutex);
1468
1469 intr = down_interruptible(&mdev->data.work.s);
1470
1471 mutex_lock(&mdev->data.mutex);
1472 if (mdev->data.socket && !mdev->net_conf->no_cork)
1473 drbd_tcp_cork(mdev->data.socket);
1474 mutex_unlock(&mdev->data.mutex);
1475 }
1476
1477 if (intr) {
1478 D_ASSERT(intr == -EINTR);
1479 flush_signals(current);
1480 ERR_IF (get_t_state(thi) == Running)
1481 continue;
1482 break;
1483 }
1484
1485 if (get_t_state(thi) != Running)
1486 break;
1487 /* With this break, we have done a down() but not consumed
1488 the entry from the list. The cleanup code takes care of
1489 this... */
1490
1491 w = NULL;
1492 spin_lock_irq(&mdev->data.work.q_lock);
1493 ERR_IF(list_empty(&mdev->data.work.q)) {
1494 /* something terribly wrong in our logic.
1495 * we were able to down() the semaphore,
1496 * but the list is empty... doh.
1497 *
1498 * what is the best thing to do now?
1499 * try again from scratch, restarting the receiver,
1500 * asender, whatnot? could break even more ugly,
1501 * e.g. when we are primary, but no good local data.
1502 *
1503 * I'll try to get away just starting over this loop.
1504 */
1505 spin_unlock_irq(&mdev->data.work.q_lock);
1506 continue;
1507 }
1508 w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
1509 list_del_init(&w->list);
1510 spin_unlock_irq(&mdev->data.work.q_lock);
1511
1512 if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
1513 /* dev_warn(DEV, "worker: a callback failed! \n"); */
1514 if (mdev->state.conn >= C_CONNECTED)
1515 drbd_force_state(mdev,
1516 NS(conn, C_NETWORK_FAILURE));
1517 }
1518 }
1519 D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
1520 D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
1521
1522 spin_lock_irq(&mdev->data.work.q_lock);
1523 i = 0;
1524 while (!list_empty(&mdev->data.work.q)) {
1525 list_splice_init(&mdev->data.work.q, &work_list);
1526 spin_unlock_irq(&mdev->data.work.q_lock);
1527
1528 while (!list_empty(&work_list)) {
1529 w = list_entry(work_list.next, struct drbd_work, list);
1530 list_del_init(&w->list);
1531 w->cb(mdev, w, 1);
1532 i++; /* dead debugging code */
1533 }
1534
1535 spin_lock_irq(&mdev->data.work.q_lock);
1536 }
1537 sema_init(&mdev->data.work.s, 0);
1538 /* DANGEROUS race: if someone did queue his work within the spinlock,
1539 * but up() ed outside the spinlock, we could get an up() on the
1540 * semaphore without corresponding list entry.
1541 * So don't do that.
1542 */
1543 spin_unlock_irq(&mdev->data.work.q_lock);
1544
1545 D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
1546 /* _drbd_set_state only uses stop_nowait.
1547 * wait here for the Exiting receiver. */
1548 drbd_thread_stop(&mdev->receiver);
1549 drbd_mdev_cleanup(mdev);
1550
1551 dev_info(DEV, "worker terminated\n");
1552
1553 clear_bit(DEVICE_DYING, &mdev->flags);
1554 clear_bit(CONFIG_PENDING, &mdev->flags);
1555 wake_up(&mdev->state_wait);
1556
1557 return 0;
1558}