Blame - drivers/block/drbd/drbd_actlog.c - kernel/msm

blob: bd925180a2b07167f8702264125e25b8716f3659 [file] [log] [blame]

Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1	/*
				2	drbd_actlog.c
				3
				4	This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
				5
				6	Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
				7	Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
				8	Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
				9
				10	drbd is free software; you can redistribute it and/or modify
				11	it under the terms of the GNU General Public License as published by
				12	the Free Software Foundation; either version 2, or (at your option)
				13	any later version.
				14
				15	drbd is distributed in the hope that it will be useful,
				16	but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				18	GNU General Public License for more details.
				19
				20	You should have received a copy of the GNU General Public License
				21	along with drbd; see the file COPYING. If not, write to
				22	the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
				23
				24	*/
				25
				26	#include <linux/slab.h>
				27	#include <linux/drbd.h>
				28	#include "drbd_int.h"
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	29	#include "drbd_wrappers.h"
				30
				31	/* We maintain a trivial check sum in our on disk activity log.
				32	* With that we can ensure correct operation even when the storage
				33	* device might do a partial (last) sector write while loosing power.
				34	*/
				35	struct __packed al_transaction {
				36	u32 magic;
				37	u32 tr_number;
				38	struct __packed {
				39	u32 pos;
				40	u32 extent; } updates[1 + AL_EXTENTS_PT];
				41	u32 xor_sum;
				42	};
				43
				44	struct update_odbm_work {
				45	struct drbd_work w;
				46	unsigned int enr;
				47	};
				48
				49	struct update_al_work {
				50	struct drbd_work w;
				51	struct lc_element *al_ext;
				52	struct completion event;
				53	unsigned int enr;
				54	/* if old_enr != LC_FREE, write corresponding bitmap sector, too */
				55	unsigned int old_enr;
				56	};
				57
				58	struct drbd_atodb_wait {
				59	atomic_t count;
				60	struct completion io_done;
				61	struct drbd_conf *mdev;
				62	int error;
				63	};
				64
				65
				66	int w_al_write_transaction(struct drbd_conf , struct drbd_work , int);
				67
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	68	static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
				69	struct drbd_backing_dev *bdev,
				70	struct page *page, sector_t sector,
				71	int rw, int size)
				72	{
				73	struct bio *bio;
				74	struct drbd_md_io md_io;
				75	int ok;
				76
				77	md_io.mdev = mdev;
				78	init_completion(&md_io.event);
				79	md_io.error = 0;
				80
				81	if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
Christoph Hellwig	7b6d91d	2010-08-07 18:20:39 +0200	[diff] [blame]	82	rw \|= REQ_HARDBARRIER;
				83	rw \|= REQ_UNPLUG \| REQ_SYNC;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	84
				85	retry:
				86	bio = bio_alloc(GFP_NOIO, 1);
				87	bio->bi_bdev = bdev->md_bdev;
				88	bio->bi_sector = sector;
				89	ok = (bio_add_page(bio, page, size, 0) == size);
				90	if (!ok)
				91	goto out;
				92	bio->bi_private = &md_io;
				93	bio->bi_end_io = drbd_md_io_complete;
				94	bio->bi_rw = rw;
				95
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	96	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
				97	bio_endio(bio, -EIO);
				98	else
				99	submit_bio(rw, bio);
				100	wait_for_completion(&md_io.event);
				101	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
				102
				103	/* check for unsupported barrier op.
				104	* would rather check on EOPNOTSUPP, but that is not reliable.
				105	* don't try again for ANY return value != 0 */
Christoph Hellwig	7b6d91d	2010-08-07 18:20:39 +0200	[diff] [blame]	106	if (unlikely((bio->bi_rw & REQ_HARDBARRIER) && !ok)) {
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	107	/* Try again with no barrier */
				108	dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
				109	set_bit(MD_NO_BARRIER, &mdev->flags);
Christoph Hellwig	7b6d91d	2010-08-07 18:20:39 +0200	[diff] [blame]	110	rw &= ~REQ_HARDBARRIER;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	111	bio_put(bio);
				112	goto retry;
				113	}
				114	out:
				115	bio_put(bio);
				116	return ok;
				117	}
				118
				119	int drbd_md_sync_page_io(struct drbd_conf mdev, struct drbd_backing_dev bdev,
				120	sector_t sector, int rw)
				121	{
				122	int logical_block_size, mask, ok;
				123	int offset = 0;
				124	struct page *iop = mdev->md_io_page;
				125
				126	D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
				127
				128	BUG_ON(!bdev->md_bdev);
				129
				130	logical_block_size = bdev_logical_block_size(bdev->md_bdev);
				131	if (logical_block_size == 0)
				132	logical_block_size = MD_SECTOR_SIZE;
				133
				134	/* in case logical_block_size != 512 [ s390 only? ] */
				135	if (logical_block_size != MD_SECTOR_SIZE) {
				136	mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
				137	D_ASSERT(mask == 1 \|\| mask == 3 \|\| mask == 7);
				138	D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
				139	offset = sector & mask;
				140	sector = sector & ~mask;
				141	iop = mdev->md_io_tmpp;
				142
				143	if (rw & WRITE) {
				144	/* these are GFP_KERNEL pages, pre-allocated
				145	* on device initialization */
				146	void *p = page_address(mdev->md_io_page);
				147	void *hp = page_address(mdev->md_io_tmpp);
				148
				149	ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
				150	READ, logical_block_size);
				151
				152	if (unlikely(!ok)) {
				153	dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
				154	"READ [logical_block_size!=512]) failed!\n",
				155	(unsigned long long)sector);
				156	return 0;
				157	}
				158
				159	memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
				160	}
				161	}
				162
				163	if (sector < drbd_md_first_sector(bdev) \|\|
				164	sector > drbd_md_last_sector(bdev))
				165	dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
				166	current->comm, current->pid, __func__,
				167	(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
				168
				169	ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
				170	if (unlikely(!ok)) {
				171	dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
				172	(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
				173	return 0;
				174	}
				175
				176	if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
				177	void *p = page_address(mdev->md_io_page);
				178	void *hp = page_address(mdev->md_io_tmpp);
				179
				180	memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
				181	}
				182
				183	return ok;
				184	}
				185
				186	static struct lc_element _al_get(struct drbd_conf mdev, unsigned int enr)
				187	{
				188	struct lc_element *al_ext;
				189	struct lc_element *tmp;
				190	unsigned long al_flags = 0;
				191
				192	spin_lock_irq(&mdev->al_lock);
				193	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
				194	if (unlikely(tmp != NULL)) {
				195	struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
				196	if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
				197	spin_unlock_irq(&mdev->al_lock);
				198	return NULL;
				199	}
				200	}
				201	al_ext = lc_get(mdev->act_log, enr);
				202	al_flags = mdev->act_log->flags;
				203	spin_unlock_irq(&mdev->al_lock);
				204
				205	/*
				206	if (!al_ext) {
				207	if (al_flags & LC_STARVING)
				208	dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
				209	if (al_flags & LC_DIRTY)
				210	dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
				211	}
				212	*/
				213
				214	return al_ext;
				215	}
				216
				217	void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
				218	{
				219	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
				220	struct lc_element *al_ext;
				221	struct update_al_work al_work;
				222
				223	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
				224
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	225	wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
				226
				227	if (al_ext->lc_number != enr) {
				228	/* drbd_al_write_transaction(mdev,al_ext,enr);
				229	* recurses into generic_make_request(), which
				230	* disallows recursion, bios being serialized on the
				231	* current->bio_tail list now.
				232	* we have to delegate updates to the activity log
				233	* to the worker thread. */
				234	init_completion(&al_work.event);
				235	al_work.al_ext = al_ext;
				236	al_work.enr = enr;
				237	al_work.old_enr = al_ext->lc_number;
				238	al_work.w.cb = w_al_write_transaction;
				239	drbd_queue_work_front(&mdev->data.work, &al_work.w);
				240	wait_for_completion(&al_work.event);
				241
				242	mdev->al_writ_cnt++;
				243
				244	spin_lock_irq(&mdev->al_lock);
				245	lc_changed(mdev->act_log, al_ext);
				246	spin_unlock_irq(&mdev->al_lock);
				247	wake_up(&mdev->al_wait);
				248	}
				249	}
				250
				251	void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
				252	{
				253	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
				254	struct lc_element *extent;
				255	unsigned long flags;
				256
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	257	spin_lock_irqsave(&mdev->al_lock, flags);
				258
				259	extent = lc_find(mdev->act_log, enr);
				260
				261	if (!extent) {
				262	spin_unlock_irqrestore(&mdev->al_lock, flags);
				263	dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
				264	return;
				265	}
				266
				267	if (lc_put(mdev->act_log, extent) == 0)
				268	wake_up(&mdev->al_wait);
				269
				270	spin_unlock_irqrestore(&mdev->al_lock, flags);
				271	}
				272
				273	int
				274	w_al_write_transaction(struct drbd_conf mdev, struct drbd_work w, int unused)
				275	{
				276	struct update_al_work *aw = container_of(w, struct update_al_work, w);
				277	struct lc_element *updated = aw->al_ext;
				278	const unsigned int new_enr = aw->enr;
				279	const unsigned int evicted = aw->old_enr;
				280	struct al_transaction *buffer;
				281	sector_t sector;
				282	int i, n, mx;
				283	unsigned int extent_nr;
				284	u32 xor_sum = 0;
				285
				286	if (!get_ldev(mdev)) {
Lars Ellenberg	6719fb0	2010-10-18 23:04:07 +0200	[diff] [blame^]	287	dev_err(DEV,
				288	"disk is %s, cannot start al transaction (-%d +%d)\n",
				289	drbd_disk_str(mdev->state.disk), evicted, new_enr);
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	290	complete(&((struct update_al_work *)w)->event);
				291	return 1;
				292	}
				293	/* do we have to do a bitmap write, first?
				294	* TODO reduce maximum latency:
				295	* submit both bios, then wait for both,
Lars Ellenberg	6719fb0	2010-10-18 23:04:07 +0200	[diff] [blame^]	296	* instead of doing two synchronous sector writes.
				297	* For now, we must not write the transaction,
				298	* if we cannot write out the bitmap of the evicted extent. */
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	299	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
				300	drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
				301
Lars Ellenberg	6719fb0	2010-10-18 23:04:07 +0200	[diff] [blame^]	302	/* The bitmap write may have failed, causing a state change. */
				303	if (mdev->state.disk < D_INCONSISTENT) {
				304	dev_err(DEV,
				305	"disk is %s, cannot write al transaction (-%d +%d)\n",
				306	drbd_disk_str(mdev->state.disk), evicted, new_enr);
				307	complete(&((struct update_al_work *)w)->event);
				308	put_ldev(mdev);
				309	return 1;
				310	}
				311
				312	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	313	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
				314
				315	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
				316	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
				317
				318	n = lc_index_of(mdev->act_log, updated);
				319
				320	buffer->updates[0].pos = cpu_to_be32(n);
				321	buffer->updates[0].extent = cpu_to_be32(new_enr);
				322
				323	xor_sum ^= new_enr;
				324
				325	mx = min_t(int, AL_EXTENTS_PT,
				326	mdev->act_log->nr_elements - mdev->al_tr_cycle);
				327	for (i = 0; i < mx; i++) {
				328	unsigned idx = mdev->al_tr_cycle + i;
				329	extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
				330	buffer->updates[i+1].pos = cpu_to_be32(idx);
				331	buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
				332	xor_sum ^= extent_nr;
				333	}
				334	for (; i < AL_EXTENTS_PT; i++) {
				335	buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
				336	buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
				337	xor_sum ^= LC_FREE;
				338	}
				339	mdev->al_tr_cycle += AL_EXTENTS_PT;
				340	if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
				341	mdev->al_tr_cycle = 0;
				342
				343	buffer->xor_sum = cpu_to_be32(xor_sum);
				344
				345	sector = mdev->ldev->md.md_offset
				346	+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
				347
				348	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
				349	drbd_chk_io_error(mdev, 1, TRUE);
				350
				351	if (++mdev->al_tr_pos >
				352	div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
				353	mdev->al_tr_pos = 0;
				354
				355	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
				356	mdev->al_tr_number++;
				357
				358	mutex_unlock(&mdev->md_io_mutex);
				359
				360	complete(&((struct update_al_work *)w)->event);
				361	put_ldev(mdev);
				362
				363	return 1;
				364	}
				365
				366	/**
				367	* drbd_al_read_tr() - Read a single transaction from the on disk activity log
				368	* @mdev: DRBD device.
				369	* @bdev: Block device to read form.
				370	* @b: pointer to an al_transaction.
				371	* @index: On disk slot of the transaction to read.
				372	*
				373	* Returns -1 on IO error, 0 on checksum error and 1 upon success.
				374	*/
				375	static int drbd_al_read_tr(struct drbd_conf *mdev,
				376	struct drbd_backing_dev *bdev,
				377	struct al_transaction *b,
				378	int index)
				379	{
				380	sector_t sector;
				381	int rv, i;
				382	u32 xor_sum = 0;
				383
				384	sector = bdev->md.md_offset + bdev->md.al_offset + index;
				385
				386	/* Dont process error normally,
				387	* as this is done before disk is attached! */
				388	if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
				389	return -1;
				390
				391	rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
				392
				393	for (i = 0; i < AL_EXTENTS_PT + 1; i++)
				394	xor_sum ^= be32_to_cpu(b->updates[i].extent);
				395	rv &= (xor_sum == be32_to_cpu(b->xor_sum));
				396
				397	return rv;
				398	}
				399
				400	/**
				401	* drbd_al_read_log() - Restores the activity log from its on disk representation.
				402	* @mdev: DRBD device.
				403	* @bdev: Block device to read form.
				404	*
				405	* Returns 1 on success, returns 0 when reading the log failed due to IO errors.
				406	*/
				407	int drbd_al_read_log(struct drbd_conf mdev, struct drbd_backing_dev bdev)
				408	{
				409	struct al_transaction *buffer;
				410	int i;
				411	int rv;
				412	int mx;
				413	int active_extents = 0;
				414	int transactions = 0;
				415	int found_valid = 0;
				416	int from = 0;
				417	int to = 0;
				418	u32 from_tnr = 0;
				419	u32 to_tnr = 0;
				420	u32 cnr;
				421
				422	mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
				423
				424	/* lock out all other meta data io for now,
				425	* and make sure the page is mapped.
				426	*/
				427	mutex_lock(&mdev->md_io_mutex);
				428	buffer = page_address(mdev->md_io_page);
				429
				430	/* Find the valid transaction in the log */
				431	for (i = 0; i <= mx; i++) {
				432	rv = drbd_al_read_tr(mdev, bdev, buffer, i);
				433	if (rv == 0)
				434	continue;
				435	if (rv == -1) {
				436	mutex_unlock(&mdev->md_io_mutex);
				437	return 0;
				438	}
				439	cnr = be32_to_cpu(buffer->tr_number);
				440
				441	if (++found_valid == 1) {
				442	from = i;
				443	to = i;
				444	from_tnr = cnr;
				445	to_tnr = cnr;
				446	continue;
				447	}
				448	if ((int)cnr - (int)from_tnr < 0) {
				449	D_ASSERT(from_tnr - cnr + i - from == mx+1);
				450	from = i;
				451	from_tnr = cnr;
				452	}
				453	if ((int)cnr - (int)to_tnr > 0) {
				454	D_ASSERT(cnr - to_tnr == i - to);
				455	to = i;
				456	to_tnr = cnr;
				457	}
				458	}
				459
				460	if (!found_valid) {
				461	dev_warn(DEV, "No usable activity log found.\n");
				462	mutex_unlock(&mdev->md_io_mutex);
				463	return 1;
				464	}
				465
				466	/* Read the valid transactions.
				467	* dev_info(DEV, "Reading from %d to %d.\n",from,to); */
				468	i = from;
				469	while (1) {
				470	int j, pos;
				471	unsigned int extent_nr;
				472	unsigned int trn;
				473
				474	rv = drbd_al_read_tr(mdev, bdev, buffer, i);
				475	ERR_IF(rv == 0) goto cancel;
				476	if (rv == -1) {
				477	mutex_unlock(&mdev->md_io_mutex);
				478	return 0;
				479	}
				480
				481	trn = be32_to_cpu(buffer->tr_number);
				482
				483	spin_lock_irq(&mdev->al_lock);
				484
				485	/* This loop runs backwards because in the cyclic
				486	elements there might be an old version of the
				487	updated element (in slot 0). So the element in slot 0
				488	can overwrite old versions. */
				489	for (j = AL_EXTENTS_PT; j >= 0; j--) {
				490	pos = be32_to_cpu(buffer->updates[j].pos);
				491	extent_nr = be32_to_cpu(buffer->updates[j].extent);
				492
				493	if (extent_nr == LC_FREE)
				494	continue;
				495
				496	lc_set(mdev->act_log, extent_nr, pos);
				497	active_extents++;
				498	}
				499	spin_unlock_irq(&mdev->al_lock);
				500
				501	transactions++;
				502
				503	cancel:
				504	if (i == to)
				505	break;
				506	i++;
				507	if (i > mx)
				508	i = 0;
				509	}
				510
				511	mdev->al_tr_number = to_tnr+1;
				512	mdev->al_tr_pos = to;
				513	if (++mdev->al_tr_pos >
				514	div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
				515	mdev->al_tr_pos = 0;
				516
				517	/* ok, we are done with it */
				518	mutex_unlock(&mdev->md_io_mutex);
				519
				520	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
				521	transactions, active_extents);
				522
				523	return 1;
				524	}
				525
				526	static void atodb_endio(struct bio *bio, int error)
				527	{
				528	struct drbd_atodb_wait *wc = bio->bi_private;
				529	struct drbd_conf *mdev = wc->mdev;
				530	struct page *page;
				531	int uptodate = bio_flagged(bio, BIO_UPTODATE);
				532
				533	/* strange behavior of some lower level drivers...
				534	* fail the request by clearing the uptodate flag,
				535	* but do not return any error?! */
				536	if (!error && !uptodate)
				537	error = -EIO;
				538
				539	drbd_chk_io_error(mdev, error, TRUE);
				540	if (error && wc->error == 0)
				541	wc->error = error;
				542
				543	if (atomic_dec_and_test(&wc->count))
				544	complete(&wc->io_done);
				545
				546	page = bio->bi_io_vec[0].bv_page;
				547	put_page(page);
				548	bio_put(bio);
				549	mdev->bm_writ_cnt++;
				550	put_ldev(mdev);
				551	}
				552
/* sector to word: convert an on-disk bitmap sector number to a word offset */
#define S2W(s)	((s) << (BM_EXT_SHIFT - BM_BLOCK_SHIFT - LN2_BPL))
Lars Ellenberg	39ad2bb	2010-03-04 15:52:30 +0100	[diff] [blame]	555
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	556	/* activity log to on disk bitmap -- prepare bio unless that sector
				557	* is already covered by previously prepared bios */
				558	static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
				559	struct bio **bios,
				560	unsigned int enr,
				561	struct drbd_atodb_wait *wc) __must_hold(local)
				562	{
				563	struct bio *bio;
				564	struct page *page;
Lars Ellenberg	39ad2bb	2010-03-04 15:52:30 +0100	[diff] [blame]	565	sector_t on_disk_sector;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	566	unsigned int page_offset = PAGE_SIZE;
				567	int offset;
				568	int i = 0;
				569	int err = -ENOMEM;
				570
Lars Ellenberg	39ad2bb	2010-03-04 15:52:30 +0100	[diff] [blame]	571	/* We always write aligned, full 4k blocks,
				572	* so we can ignore the logical_block_size (for now) */
				573	enr &= ~7U;
				574	on_disk_sector = enr + mdev->ldev->md.md_offset
				575	+ mdev->ldev->md.bm_offset;
				576
				577	D_ASSERT(!(on_disk_sector & 7U));
				578
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	579	/* Check if that enr is already covered by an already created bio.
				580	* Caution, bios[] is not NULL terminated,
				581	* but only initialized to all NULL.
				582	* For completely scattered activity log,
				583	* the last invocation iterates over all bios,
				584	* and finds the last NULL entry.
				585	*/
				586	while ((bio = bios[i])) {
				587	if (bio->bi_sector == on_disk_sector)
				588	return 0;
				589	i++;
				590	}
				591	/* bios[i] == NULL, the next not yet used slot */
				592
				593	/* GFP_KERNEL, we are not in the write-out path */
				594	bio = bio_alloc(GFP_KERNEL, 1);
				595	if (bio == NULL)
				596	return -ENOMEM;
				597
				598	if (i > 0) {
				599	const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
				600	page_offset = prev_bv->bv_offset + prev_bv->bv_len;
				601	page = prev_bv->bv_page;
				602	}
				603	if (page_offset == PAGE_SIZE) {
				604	page = alloc_page(__GFP_HIGHMEM);
				605	if (page == NULL)
				606	goto out_bio_put;
				607	page_offset = 0;
				608	} else {
				609	get_page(page);
				610	}
				611
				612	offset = S2W(enr);
				613	drbd_bm_get_lel(mdev, offset,
Lars Ellenberg	39ad2bb	2010-03-04 15:52:30 +0100	[diff] [blame]	614	min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset),
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	615	kmap(page) + page_offset);
				616	kunmap(page);
				617
				618	bio->bi_private = wc;
				619	bio->bi_end_io = atodb_endio;
				620	bio->bi_bdev = mdev->ldev->md_bdev;
				621	bio->bi_sector = on_disk_sector;
				622
Lars Ellenberg	39ad2bb	2010-03-04 15:52:30 +0100	[diff] [blame]	623	if (bio_add_page(bio, page, 4096, page_offset) != 4096)
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	624	goto out_put_page;
				625
				626	atomic_inc(&wc->count);
				627	/* we already know that we may do this...
				628	* get_ldev_if_state(mdev,D_ATTACHING);
				629	* just get the extra reference, so that the local_cnt reflects
				630	* the number of pending IO requests DRBD at its backing device.
				631	*/
				632	atomic_inc(&mdev->local_cnt);
				633
				634	bios[i] = bio;
				635
				636	return 0;
				637
				638	out_put_page:
				639	err = -EINVAL;
				640	put_page(page);
				641	out_bio_put:
				642	bio_put(bio);
				643	return err;
				644	}
				645
				646	/**
				647	* drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
				648	* @mdev: DRBD device.
				649	*
				650	* Called when we detach (unconfigure) local storage,
				651	* or when we go from R_PRIMARY to R_SECONDARY role.
				652	*/
				653	void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
				654	{
				655	int i, nr_elements;
				656	unsigned int enr;
				657	struct bio **bios;
				658	struct drbd_atodb_wait wc;
				659
				660	ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
				661	return; /* sorry, I don't have any act_log etc... */
				662
				663	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
				664
				665	nr_elements = mdev->act_log->nr_elements;
				666
				667	/* GFP_KERNEL, we are not in anyone's write-out path */
				668	bios = kzalloc(sizeof(struct bio ) nr_elements, GFP_KERNEL);
				669	if (!bios)
				670	goto submit_one_by_one;
				671
				672	atomic_set(&wc.count, 0);
				673	init_completion(&wc.io_done);
				674	wc.mdev = mdev;
				675	wc.error = 0;
				676
				677	for (i = 0; i < nr_elements; i++) {
				678	enr = lc_element_by_index(mdev->act_log, i)->lc_number;
				679	if (enr == LC_FREE)
				680	continue;
				681	/* next statement also does atomic_inc wc.count and local_cnt */
				682	if (atodb_prepare_unless_covered(mdev, bios,
				683	enr/AL_EXT_PER_BM_SECT,
				684	&wc))
				685	goto free_bios_submit_one_by_one;
				686	}
				687
				688	/* unnecessary optimization? */
				689	lc_unlock(mdev->act_log);
				690	wake_up(&mdev->al_wait);
				691
				692	/* all prepared, submit them */
				693	for (i = 0; i < nr_elements; i++) {
				694	if (bios[i] == NULL)
				695	break;
				696	if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
				697	bios[i]->bi_rw = WRITE;
				698	bio_endio(bios[i], -EIO);
				699	} else {
				700	submit_bio(WRITE, bios[i]);
				701	}
				702	}
				703
				704	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
				705
				706	/* always (try to) flush bitmap to stable storage */
				707	drbd_md_flush(mdev);
				708
				709	/* In case we did not submit a single IO do not wait for
				710	* them to complete. ( Because we would wait forever here. )
				711	*
				712	* In case we had IOs and they are already complete, there
				713	* is not point in waiting anyways.
				714	* Therefore this if () ... */
				715	if (atomic_read(&wc.count))
				716	wait_for_completion(&wc.io_done);
				717
				718	put_ldev(mdev);
				719
				720	kfree(bios);
				721	return;
				722
				723	free_bios_submit_one_by_one:
				724	/* free everything by calling the endio callback directly. */
				725	for (i = 0; i < nr_elements && bios[i]; i++)
				726	bio_endio(bios[i], 0);
				727
				728	kfree(bios);
				729
				730	submit_one_by_one:
				731	dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
				732
				733	for (i = 0; i < mdev->act_log->nr_elements; i++) {
				734	enr = lc_element_by_index(mdev->act_log, i)->lc_number;
				735	if (enr == LC_FREE)
				736	continue;
				737	/* Really slow: if we have al-extents 16..19 active,
				738	* sector 4 will be written four times! Synchronous! */
				739	drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
				740	}
				741
				742	lc_unlock(mdev->act_log);
				743	wake_up(&mdev->al_wait);
				744	put_ldev(mdev);
				745	}
				746
				747	/**
				748	* drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
				749	* @mdev: DRBD device.
				750	*/
				751	void drbd_al_apply_to_bm(struct drbd_conf *mdev)
				752	{
				753	unsigned int enr;
				754	unsigned long add = 0;
				755	char ppb[10];
Lars Ellenberg	6719fb0	2010-10-18 23:04:07 +0200	[diff] [blame^]	756	int i, tmp;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	757
				758	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
				759
				760	for (i = 0; i < mdev->act_log->nr_elements; i++) {
				761	enr = lc_element_by_index(mdev->act_log, i)->lc_number;
				762	if (enr == LC_FREE)
				763	continue;
Lars Ellenberg	6719fb0	2010-10-18 23:04:07 +0200	[diff] [blame^]	764	tmp = drbd_bm_ALe_set_all(mdev, enr);
				765	dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
				766	add += tmp;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	767	}
				768
				769	lc_unlock(mdev->act_log);
				770	wake_up(&mdev->al_wait);
				771
				772	dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
				773	ppsize(ppb, Bit2KB(add)));
				774	}
				775
				776	static int _try_lc_del(struct drbd_conf mdev, struct lc_element al_ext)
				777	{
				778	int rv;
				779
				780	spin_lock_irq(&mdev->al_lock);
				781	rv = (al_ext->refcnt == 0);
				782	if (likely(rv))
				783	lc_del(mdev->act_log, al_ext);
				784	spin_unlock_irq(&mdev->al_lock);
				785
				786	return rv;
				787	}
				788
				789	/**
				790	* drbd_al_shrink() - Removes all active extents form the activity log
				791	* @mdev: DRBD device.
				792	*
				793	* Removes all active extents form the activity log, waiting until
				794	* the reference count of each entry dropped to 0 first, of course.
				795	*
				796	* You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
				797	*/
				798	void drbd_al_shrink(struct drbd_conf *mdev)
				799	{
				800	struct lc_element *al_ext;
				801	int i;
				802
				803	D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
				804
				805	for (i = 0; i < mdev->act_log->nr_elements; i++) {
				806	al_ext = lc_element_by_index(mdev->act_log, i);
				807	if (al_ext->lc_number == LC_FREE)
				808	continue;
				809	wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
				810	}
				811
				812	wake_up(&mdev->al_wait);
				813	}
				814
				815	static int w_update_odbm(struct drbd_conf mdev, struct drbd_work w, int unused)
				816	{
				817	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
				818
				819	if (!get_ldev(mdev)) {
				820	if (__ratelimit(&drbd_ratelimit_state))
				821	dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
				822	kfree(udw);
				823	return 1;
				824	}
				825
				826	drbd_bm_write_sect(mdev, udw->enr);
				827	put_ldev(mdev);
				828
				829	kfree(udw);
				830
				831	if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
				832	switch (mdev->state.conn) {
				833	case C_SYNC_SOURCE: case C_SYNC_TARGET:
				834	case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
				835	drbd_resync_finished(mdev);
				836	default:
				837	/* nothing to do */
				838	break;
				839	}
				840	}
				841	drbd_bcast_sync_progress(mdev);
				842
				843	return 1;
				844	}
				845
				846
				847	/* ATTENTION. The AL's extents are 4MB each, while the extents in the
				848	* resync LRU-cache are 16MB each.
				849	* The caller of this function has to hold an get_ldev() reference.
				850	*
				851	* TODO will be obsoleted once we have a caching lru of the on disk bitmap
				852	*/
				853	static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
				854	int count, int success)
				855	{
				856	struct lc_element *e;
				857	struct update_odbm_work *udw;
				858
				859	unsigned int enr;
				860
				861	D_ASSERT(atomic_read(&mdev->local_cnt));
				862
				863	/* I simply assume that a sector/size pair never crosses
				864	* a 16 MB extent border. (Currently this is true...) */
				865	enr = BM_SECT_TO_EXT(sector);
				866
				867	e = lc_get(mdev->resync, enr);
				868	if (e) {
				869	struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
				870	if (ext->lce.lc_number == enr) {
				871	if (success)
				872	ext->rs_left -= count;
				873	else
				874	ext->rs_failed += count;
				875	if (ext->rs_left < ext->rs_failed) {
				876	dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
				877	"rs_failed=%d count=%d\n",
				878	(unsigned long long)sector,
				879	ext->lce.lc_number, ext->rs_left,
				880	ext->rs_failed, count);
				881	dump_stack();
				882
				883	lc_put(mdev->resync, &ext->lce);
				884	drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
				885	return;
				886	}
				887	} else {
				888	/* Normally this element should be in the cache,
				889	* since drbd_rs_begin_io() pulled it already in.
				890	*
				891	* But maybe an application write finished, and we set
				892	* something outside the resync lru_cache in sync.
				893	*/
				894	int rs_left = drbd_bm_e_weight(mdev, enr);
				895	if (ext->flags != 0) {
				896	dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
				897	" -> %d[%u;00]\n",
				898	ext->lce.lc_number, ext->rs_left,
				899	ext->flags, enr, rs_left);
				900	ext->flags = 0;
				901	}
				902	if (ext->rs_failed) {
				903	dev_warn(DEV, "Kicking resync_lru element enr=%u "
				904	"out with rs_failed=%d\n",
				905	ext->lce.lc_number, ext->rs_failed);
				906	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
				907	}
				908	ext->rs_left = rs_left;
				909	ext->rs_failed = success ? 0 : count;
				910	lc_changed(mdev->resync, &ext->lce);
				911	}
				912	lc_put(mdev->resync, &ext->lce);
				913	/* no race, we are within the al_lock! */
				914
				915	if (ext->rs_left == ext->rs_failed) {
				916	ext->rs_failed = 0;
				917
				918	udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
				919	if (udw) {
				920	udw->enr = ext->lce.lc_number;
				921	udw->w.cb = w_update_odbm;
				922	drbd_queue_work_front(&mdev->data.work, &udw->w);
				923	} else {
				924	dev_warn(DEV, "Could not kmalloc an udw\n");
				925	set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
				926	}
				927	}
				928	} else {
				929	dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
				930	mdev->resync_locked,
				931	mdev->resync->nr_elements,
				932	mdev->resync->flags);
				933	}
				934	}
				935
				936	/* clear the bit corresponding to the piece of storage in question:
				937	* size byte of data starting from sector. Only clear a bits of the affected
				938	* one ore more _aligned_ BM_BLOCK_SIZE blocks.
				939	*
				940	* called by worker on C_SYNC_TARGET and receiver on SyncSource.
				941	*
				942	*/
				943	void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
				944	const char *file, const unsigned int line)
				945	{
				946	/* Is called from worker and receiver context _only_ */
				947	unsigned long sbnr, ebnr, lbnr;
				948	unsigned long count = 0;
				949	sector_t esector, nr_sectors;
				950	int wake_up = 0;
				951	unsigned long flags;
				952
				953	if (size <= 0 \|\| (size & 0x1ff) != 0 \|\| size > DRBD_MAX_SEGMENT_SIZE) {
				954	dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
				955	(unsigned long long)sector, size);
				956	return;
				957	}
				958	nr_sectors = drbd_get_capacity(mdev->this_bdev);
				959	esector = sector + (size >> 9) - 1;
				960
				961	ERR_IF(sector >= nr_sectors) return;
				962	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
				963
				964	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
				965
				966	/* we clear it (in sync).
				967	* round up start sector, round down end sector. we make sure we only
				968	* clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
				969	if (unlikely(esector < BM_SECT_PER_BIT-1))
				970	return;
				971	if (unlikely(esector == (nr_sectors-1)))
				972	ebnr = lbnr;
				973	else
				974	ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
				975	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
				976
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	977	if (sbnr > ebnr)
				978	return;
				979
				980	/*
				981	* ok, (capacity & 7) != 0 sometimes, but who cares...
				982	* we count rs_{total,left} in bits, not sectors.
				983	*/
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	984	count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
Lars Ellenberg	1d7734a	2010-08-11 21:21:50 +0200	[diff] [blame]	985	if (count && get_ldev(mdev)) {
				986	unsigned long now = jiffies;
				987	unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
				988	int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
				989	if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
				990	unsigned long tw = drbd_bm_total_weight(mdev);
				991	if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	992	mdev->state.conn != C_PAUSED_SYNC_T &&
				993	mdev->state.conn != C_PAUSED_SYNC_S) {
Lars Ellenberg	1d7734a	2010-08-11 21:21:50 +0200	[diff] [blame]	994	mdev->rs_mark_time[next] = now;
				995	mdev->rs_mark_left[next] = tw;
				996	mdev->rs_last_mark = next;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	997	}
				998	}
Lars Ellenberg	1d7734a	2010-08-11 21:21:50 +0200	[diff] [blame]	999	spin_lock_irqsave(&mdev->al_lock, flags);
				1000	drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
				1001	spin_unlock_irqrestore(&mdev->al_lock, flags);
				1002
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1003	/* just wake_up unconditional now, various lc_chaged(),
				1004	* lc_put() in drbd_try_clear_on_disk_bm(). */
				1005	wake_up = 1;
Lars Ellenberg	1d7734a	2010-08-11 21:21:50 +0200	[diff] [blame]	1006	put_ldev(mdev);
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1007	}
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1008	if (wake_up)
				1009	wake_up(&mdev->al_wait);
				1010	}
				1011
				1012	/*
				1013	* this is intended to set one request worth of data out of sync.
				1014	* affects at least 1 bit,
				1015	* and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
				1016	*
				1017	* called by tl_clear and drbd_send_dblock (==drbd_make_request).
				1018	* so this can be _any_ process.
				1019	*/
				1020	void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
				1021	const char *file, const unsigned int line)
				1022	{
				1023	unsigned long sbnr, ebnr, lbnr, flags;
				1024	sector_t esector, nr_sectors;
				1025	unsigned int enr, count;
				1026	struct lc_element *e;
				1027
				1028	if (size <= 0 \|\| (size & 0x1ff) != 0 \|\| size > DRBD_MAX_SEGMENT_SIZE) {
				1029	dev_err(DEV, "sector: %llus, size: %d\n",
				1030	(unsigned long long)sector, size);
				1031	return;
				1032	}
				1033
				1034	if (!get_ldev(mdev))
				1035	return; /* no disk, no metadata, no bitmap to set bits in */
				1036
				1037	nr_sectors = drbd_get_capacity(mdev->this_bdev);
				1038	esector = sector + (size >> 9) - 1;
				1039
				1040	ERR_IF(sector >= nr_sectors)
				1041	goto out;
				1042	ERR_IF(esector >= nr_sectors)
				1043	esector = (nr_sectors-1);
				1044
				1045	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
				1046
				1047	/* we set it out of sync,
				1048	* we do not need to round anything here */
				1049	sbnr = BM_SECT_TO_BIT(sector);
				1050	ebnr = BM_SECT_TO_BIT(esector);
				1051
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1052	/* ok, (capacity & 7) != 0 sometimes, but who cares...
				1053	* we count rs_{total,left} in bits, not sectors. */
				1054	spin_lock_irqsave(&mdev->al_lock, flags);
				1055	count = drbd_bm_set_bits(mdev, sbnr, ebnr);
				1056
				1057	enr = BM_SECT_TO_EXT(sector);
				1058	e = lc_find(mdev->resync, enr);
				1059	if (e)
				1060	lc_entry(e, struct bm_extent, lce)->rs_left += count;
				1061	spin_unlock_irqrestore(&mdev->al_lock, flags);
				1062
				1063	out:
				1064	put_ldev(mdev);
				1065	}
				1066
				1067	static
				1068	struct bm_extent _bme_get(struct drbd_conf mdev, unsigned int enr)
				1069	{
				1070	struct lc_element *e;
				1071	struct bm_extent *bm_ext;
				1072	int wakeup = 0;
				1073	unsigned long rs_flags;
				1074
				1075	spin_lock_irq(&mdev->al_lock);
				1076	if (mdev->resync_locked > mdev->resync->nr_elements/2) {
				1077	spin_unlock_irq(&mdev->al_lock);
				1078	return NULL;
				1079	}
				1080	e = lc_get(mdev->resync, enr);
				1081	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
				1082	if (bm_ext) {
				1083	if (bm_ext->lce.lc_number != enr) {
				1084	bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
				1085	bm_ext->rs_failed = 0;
				1086	lc_changed(mdev->resync, &bm_ext->lce);
				1087	wakeup = 1;
				1088	}
				1089	if (bm_ext->lce.refcnt == 1)
				1090	mdev->resync_locked++;
				1091	set_bit(BME_NO_WRITES, &bm_ext->flags);
				1092	}
				1093	rs_flags = mdev->resync->flags;
				1094	spin_unlock_irq(&mdev->al_lock);
				1095	if (wakeup)
				1096	wake_up(&mdev->al_wait);
				1097
				1098	if (!bm_ext) {
				1099	if (rs_flags & LC_STARVING)
				1100	dev_warn(DEV, "Have to wait for element"
				1101	" (resync LRU too small?)\n");
				1102	BUG_ON(rs_flags & LC_DIRTY);
				1103	}
				1104
				1105	return bm_ext;
				1106	}
				1107
				1108	static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
				1109	{
				1110	struct lc_element *al_ext;
				1111	int rv = 0;
				1112
				1113	spin_lock_irq(&mdev->al_lock);
				1114	if (unlikely(enr == mdev->act_log->new_number))
				1115	rv = 1;
				1116	else {
				1117	al_ext = lc_find(mdev->act_log, enr);
				1118	if (al_ext) {
				1119	if (al_ext->refcnt)
				1120	rv = 1;
				1121	}
				1122	}
				1123	spin_unlock_irq(&mdev->al_lock);
				1124
				1125	/*
				1126	if (unlikely(rv)) {
				1127	dev_info(DEV, "Delaying sync read until app's write is done\n");
				1128	}
				1129	*/
				1130	return rv;
				1131	}
				1132
				1133	/**
				1134	* drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
				1135	* @mdev: DRBD device.
				1136	* @sector: The sector number.
				1137	*
Lars Ellenberg	80a40e4	2010-08-11 23:28:00 +0200	[diff] [blame]	1138	* This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1139	*/
				1140	int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
				1141	{
				1142	unsigned int enr = BM_SECT_TO_EXT(sector);
				1143	struct bm_extent *bm_ext;
				1144	int i, sig;
				1145
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1146	sig = wait_event_interruptible(mdev->al_wait,
				1147	(bm_ext = _bme_get(mdev, enr)));
				1148	if (sig)
Lars Ellenberg	80a40e4	2010-08-11 23:28:00 +0200	[diff] [blame]	1149	return -EINTR;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1150
				1151	if (test_bit(BME_LOCKED, &bm_ext->flags))
Lars Ellenberg	80a40e4	2010-08-11 23:28:00 +0200	[diff] [blame]	1152	return 0;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1153
				1154	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
				1155	sig = wait_event_interruptible(mdev->al_wait,
				1156	!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
				1157	if (sig) {
				1158	spin_lock_irq(&mdev->al_lock);
				1159	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
				1160	clear_bit(BME_NO_WRITES, &bm_ext->flags);
				1161	mdev->resync_locked--;
				1162	wake_up(&mdev->al_wait);
				1163	}
				1164	spin_unlock_irq(&mdev->al_lock);
Lars Ellenberg	80a40e4	2010-08-11 23:28:00 +0200	[diff] [blame]	1165	return -EINTR;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1166	}
				1167	}
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1168	set_bit(BME_LOCKED, &bm_ext->flags);
Lars Ellenberg	80a40e4	2010-08-11 23:28:00 +0200	[diff] [blame]	1169	return 0;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1170	}
				1171
				1172	/**
				1173	* drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
				1174	* @mdev: DRBD device.
				1175	* @sector: The sector number.
				1176	*
				1177	* Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
				1178	* tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
				1179	* if there is still application IO going on in this area.
				1180	*/
				1181	int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
				1182	{
				1183	unsigned int enr = BM_SECT_TO_EXT(sector);
				1184	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
				1185	struct lc_element *e;
				1186	struct bm_extent *bm_ext;
				1187	int i;
				1188
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1189	spin_lock_irq(&mdev->al_lock);
				1190	if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
				1191	/* in case you have very heavy scattered io, it may
				1192	* stall the syncer undefined if we give up the ref count
				1193	* when we try again and requeue.
				1194	*
				1195	* if we don't give up the refcount, but the next time
				1196	* we are scheduled this extent has been "synced" by new
				1197	* application writes, we'd miss the lc_put on the
				1198	* extent we keep the refcount on.
				1199	* so we remembered which extent we had to try again, and
				1200	* if the next requested one is something else, we do
				1201	* the lc_put here...
				1202	* we also have to wake_up
				1203	*/
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1204	e = lc_find(mdev->resync, mdev->resync_wenr);
				1205	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
				1206	if (bm_ext) {
				1207	D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
				1208	D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
				1209	clear_bit(BME_NO_WRITES, &bm_ext->flags);
				1210	mdev->resync_wenr = LC_FREE;
				1211	if (lc_put(mdev->resync, &bm_ext->lce) == 0)
				1212	mdev->resync_locked--;
				1213	wake_up(&mdev->al_wait);
				1214	} else {
				1215	dev_alert(DEV, "LOGIC BUG\n");
				1216	}
				1217	}
				1218	/* TRY. */
				1219	e = lc_try_get(mdev->resync, enr);
				1220	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
				1221	if (bm_ext) {
				1222	if (test_bit(BME_LOCKED, &bm_ext->flags))
				1223	goto proceed;
				1224	if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
				1225	mdev->resync_locked++;
				1226	} else {
				1227	/* we did set the BME_NO_WRITES,
				1228	* but then could not set BME_LOCKED,
				1229	* so we tried again.
				1230	* drop the extra reference. */
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1231	bm_ext->lce.refcnt--;
				1232	D_ASSERT(bm_ext->lce.refcnt > 0);
				1233	}
				1234	goto check_al;
				1235	} else {
				1236	/* do we rather want to try later? */
Jens Axboe	6a0afdf	2009-10-01 09:04:14 +0200	[diff] [blame]	1237	if (mdev->resync_locked > mdev->resync->nr_elements-3)
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1238	goto try_again;
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1239	/* Do or do not. There is no try. -- Yoda */
				1240	e = lc_get(mdev->resync, enr);
				1241	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
				1242	if (!bm_ext) {
				1243	const unsigned long rs_flags = mdev->resync->flags;
				1244	if (rs_flags & LC_STARVING)
				1245	dev_warn(DEV, "Have to wait for element"
				1246	" (resync LRU too small?)\n");
				1247	BUG_ON(rs_flags & LC_DIRTY);
				1248	goto try_again;
				1249	}
				1250	if (bm_ext->lce.lc_number != enr) {
				1251	bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
				1252	bm_ext->rs_failed = 0;
				1253	lc_changed(mdev->resync, &bm_ext->lce);
				1254	wake_up(&mdev->al_wait);
				1255	D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
				1256	}
				1257	set_bit(BME_NO_WRITES, &bm_ext->flags);
				1258	D_ASSERT(bm_ext->lce.refcnt == 1);
				1259	mdev->resync_locked++;
				1260	goto check_al;
				1261	}
				1262	check_al:
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1263	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
				1264	if (unlikely(al_enr+i == mdev->act_log->new_number))
				1265	goto try_again;
				1266	if (lc_is_used(mdev->act_log, al_enr+i))
				1267	goto try_again;
				1268	}
				1269	set_bit(BME_LOCKED, &bm_ext->flags);
				1270	proceed:
				1271	mdev->resync_wenr = LC_FREE;
				1272	spin_unlock_irq(&mdev->al_lock);
				1273	return 0;
				1274
				1275	try_again:
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1276	if (bm_ext)
				1277	mdev->resync_wenr = enr;
				1278	spin_unlock_irq(&mdev->al_lock);
				1279	return -EAGAIN;
				1280	}
				1281
				1282	void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
				1283	{
				1284	unsigned int enr = BM_SECT_TO_EXT(sector);
				1285	struct lc_element *e;
				1286	struct bm_extent *bm_ext;
				1287	unsigned long flags;
				1288
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1289	spin_lock_irqsave(&mdev->al_lock, flags);
				1290	e = lc_find(mdev->resync, enr);
				1291	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
				1292	if (!bm_ext) {
				1293	spin_unlock_irqrestore(&mdev->al_lock, flags);
				1294	if (__ratelimit(&drbd_ratelimit_state))
				1295	dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
				1296	return;
				1297	}
				1298
				1299	if (bm_ext->lce.refcnt == 0) {
				1300	spin_unlock_irqrestore(&mdev->al_lock, flags);
				1301	dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
				1302	"but refcnt is 0!?\n",
				1303	(unsigned long long)sector, enr);
				1304	return;
				1305	}
				1306
				1307	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
				1308	clear_bit(BME_LOCKED, &bm_ext->flags);
				1309	clear_bit(BME_NO_WRITES, &bm_ext->flags);
				1310	mdev->resync_locked--;
				1311	wake_up(&mdev->al_wait);
				1312	}
				1313
				1314	spin_unlock_irqrestore(&mdev->al_lock, flags);
				1315	}
				1316
				1317	/**
				1318	* drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
				1319	* @mdev: DRBD device.
				1320	*/
				1321	void drbd_rs_cancel_all(struct drbd_conf *mdev)
				1322	{
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1323	spin_lock_irq(&mdev->al_lock);
				1324
				1325	if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
				1326	lc_reset(mdev->resync);
				1327	put_ldev(mdev);
				1328	}
				1329	mdev->resync_locked = 0;
				1330	mdev->resync_wenr = LC_FREE;
				1331	spin_unlock_irq(&mdev->al_lock);
				1332	wake_up(&mdev->al_wait);
				1333	}
				1334
				1335	/**
				1336	* drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
				1337	* @mdev: DRBD device.
				1338	*
				1339	* Returns 0 upon success, -EAGAIN if at least one reference count was
				1340	* not zero.
				1341	*/
				1342	int drbd_rs_del_all(struct drbd_conf *mdev)
				1343	{
				1344	struct lc_element *e;
				1345	struct bm_extent *bm_ext;
				1346	int i;
				1347
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1348	spin_lock_irq(&mdev->al_lock);
				1349
				1350	if (get_ldev_if_state(mdev, D_FAILED)) {
				1351	/* ok, ->resync is there. */
				1352	for (i = 0; i < mdev->resync->nr_elements; i++) {
				1353	e = lc_element_by_index(mdev->resync, i);
Philipp Reisner	b2b163d	2010-04-02 08:40:33 +0200	[diff] [blame]	1354	bm_ext = lc_entry(e, struct bm_extent, lce);
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1355	if (bm_ext->lce.lc_number == LC_FREE)
				1356	continue;
				1357	if (bm_ext->lce.lc_number == mdev->resync_wenr) {
				1358	dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
				1359	" got 'synced' by application io\n",
				1360	mdev->resync_wenr);
				1361	D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
				1362	D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
				1363	clear_bit(BME_NO_WRITES, &bm_ext->flags);
				1364	mdev->resync_wenr = LC_FREE;
				1365	lc_put(mdev->resync, &bm_ext->lce);
				1366	}
				1367	if (bm_ext->lce.refcnt != 0) {
				1368	dev_info(DEV, "Retrying drbd_rs_del_all() later. "
				1369	"refcnt=%d\n", bm_ext->lce.refcnt);
				1370	put_ldev(mdev);
				1371	spin_unlock_irq(&mdev->al_lock);
				1372	return -EAGAIN;
				1373	}
				1374	D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
				1375	D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
				1376	lc_del(mdev->resync, &bm_ext->lce);
				1377	}
				1378	D_ASSERT(mdev->resync->used == 0);
				1379	put_ldev(mdev);
				1380	}
				1381	spin_unlock_irq(&mdev->al_lock);
				1382
				1383	return 0;
				1384	}
				1385
				1386	/**
				1387	* drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
				1388	* @mdev: DRBD device.
				1389	* @sector: The sector number.
				1390	* @size: Size of failed IO operation, in byte.
				1391	*/
				1392	void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
				1393	{
				1394	/* Is called from worker and receiver context _only_ */
				1395	unsigned long sbnr, ebnr, lbnr;
				1396	unsigned long count;
				1397	sector_t esector, nr_sectors;
				1398	int wake_up = 0;
				1399
Philipp Reisner	b411b36	2009-09-25 16:07:19 -0700	[diff] [blame]	1400	if (size <= 0 \|\| (size & 0x1ff) != 0 \|\| size > DRBD_MAX_SEGMENT_SIZE) {
				1401	dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
				1402	(unsigned long long)sector, size);
				1403	return;
				1404	}
				1405	nr_sectors = drbd_get_capacity(mdev->this_bdev);
				1406	esector = sector + (size >> 9) - 1;
				1407
				1408	ERR_IF(sector >= nr_sectors) return;
				1409	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
				1410
				1411	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
				1412
				1413	/*
				1414	* round up start sector, round down end sector. we make sure we only
				1415	* handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
				1416	if (unlikely(esector < BM_SECT_PER_BIT-1))
				1417	return;
				1418	if (unlikely(esector == (nr_sectors-1)))
				1419	ebnr = lbnr;
				1420	else
				1421	ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
				1422	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
				1423
				1424	if (sbnr > ebnr)
				1425	return;
				1426
				1427	/*
				1428	* ok, (capacity & 7) != 0 sometimes, but who cares...
				1429	* we count rs_{total,left} in bits, not sectors.
				1430	*/
				1431	spin_lock_irq(&mdev->al_lock);
				1432	count = drbd_bm_count_bits(mdev, sbnr, ebnr);
				1433	if (count) {
				1434	mdev->rs_failed += count;
				1435
				1436	if (get_ldev(mdev)) {
				1437	drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
				1438	put_ldev(mdev);
				1439	}
				1440
				1441	/* just wake_up unconditional now, various lc_chaged(),
				1442	* lc_put() in drbd_try_clear_on_disk_bm(). */
				1443	wake_up = 1;
				1444	}
				1445	spin_unlock_irq(&mdev->al_lock);
				1446	if (wake_up)
				1447	wake_up(&mdev->al_wait);
				1448	}