Blame - fs/btrfs/reada.c - kernel/msm-4.9

blob: a955669519a265bbfb5f13de4723c87932f8047e [file] [log] [blame]

Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 STRATO. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/sched.h>
				20	#include <linux/pagemap.h>
				21	#include <linux/writeback.h>
				22	#include <linux/blkdev.h>
				23	#include <linux/rbtree.h>
				24	#include <linux/slab.h>
				25	#include <linux/workqueue.h>
				26	#include "ctree.h"
				27	#include "volumes.h"
				28	#include "disk-io.h"
				29	#include "transaction.h"
				30
				31	#undef DEBUG
				32
				33	/*
				34	* This is the implementation for the generic read ahead framework.
				35	*
				36	* To trigger a readahead, btrfs_reada_add must be called. It will start
				37	* a read ahead for the given range [start, end) on tree root. The returned
				38	* handle can either be used to wait on the readahead to finish
				39	* (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
				40	*
				41	* The read ahead works as follows:
				42	* On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
				43	* reada_start_machine will then search for extents to prefetch and trigger
				44	* some reads. When a read finishes for a node, all contained node/leaf
				45	* pointers that lie in the given range will also be enqueued. The reads will
				46	* be triggered in sequential order, thus giving a big win over a naive
				47	* enumeration. It will also make use of multi-device layouts. Each disk
				48	* will have its own read pointer and all disks will be utilized in parallel.
				49	* Also, no two disks will read both sides of a mirror simultaneously, as this
				50	* would waste seeking capacity. Instead both disks will read different parts
				51	* of the filesystem.
				52	* Any number of readaheads can be started in parallel. The read order will be
				53	* determined globally, i.e. 2 parallel readaheads will normally finish faster
				54	* than the 2 started one after another.
				55	*/
				56
/*
 * Per-device cap on reada_in_flight: __reada_start_machine() only asks a
 * device for more work while its counter is below this value.
 */
#define MAX_IN_FLIGHT 6
				58
				59	struct reada_extctl {
				60	struct list_head list;
				61	struct reada_control *rc;
				62	u64 generation;
				63	};
				64
				65	struct reada_extent {
				66	u64 logical;
				67	struct btrfs_key top;
				68	u32 blocksize;
				69	int err;
				70	struct list_head extctl;
Al Viro	99621b4	2012-08-29 16:31:33 -0400	[diff] [blame]	71	int refcnt;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	72	spinlock_t lock;
Stefan Behrens	94598ba	2012-03-27 14:21:26 -0400	[diff] [blame]	73	struct reada_zone *zones[BTRFS_MAX_MIRRORS];
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	74	int nzones;
				75	struct btrfs_device *scheduled_for;
				76	};
				77
				78	struct reada_zone {
				79	u64 start;
				80	u64 end;
				81	u64 elems;
				82	struct list_head list;
				83	spinlock_t lock;
				84	int locked;
				85	struct btrfs_device *device;
Stefan Behrens	94598ba	2012-03-27 14:21:26 -0400	[diff] [blame]	86	struct btrfs_device devs[BTRFS_MAX_MIRRORS]; / full list, incl
				87	* self */
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	88	int ndevs;
				89	struct kref refcnt;
				90	};
				91
				92	struct reada_machine_work {
				93	struct btrfs_work work;
				94	struct btrfs_fs_info *fs_info;
				95	};
				96
				97	static void reada_extent_put(struct btrfs_fs_info , struct reada_extent );
				98	static void reada_control_release(struct kref *kref);
				99	static void reada_zone_release(struct kref *kref);
				100	static void reada_start_machine(struct btrfs_fs_info *fs_info);
				101	static void __reada_start_machine(struct btrfs_fs_info *fs_info);
				102
				103	static int reada_add_block(struct reada_control *rc, u64 logical,
				104	struct btrfs_key *top, int level, u64 generation);
				105
				106	/* recurses */
				107	/* in case of err, eb might be NULL */
				108	static int __readahead_hook(struct btrfs_root root, struct extent_buffer eb,
				109	u64 start, int err)
				110	{
				111	int level = 0;
				112	int nritems;
				113	int i;
				114	u64 bytenr;
				115	u64 generation;
				116	struct reada_extent *re;
				117	struct btrfs_fs_info *fs_info = root->fs_info;
				118	struct list_head list;
				119	unsigned long index = start >> PAGE_CACHE_SHIFT;
				120	struct btrfs_device *for_dev;
				121
				122	if (eb)
				123	level = btrfs_header_level(eb);
				124
				125	/* find extent */
				126	spin_lock(&fs_info->reada_lock);
				127	re = radix_tree_lookup(&fs_info->reada_tree, index);
				128	if (re)
Al Viro	99621b4	2012-08-29 16:31:33 -0400	[diff] [blame]	129	re->refcnt++;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	130	spin_unlock(&fs_info->reada_lock);
				131
				132	if (!re)
				133	return -1;
				134
				135	spin_lock(&re->lock);
				136	/*
				137	* just take the full list from the extent. afterwards we
				138	* don't need the lock anymore
				139	*/
				140	list_replace_init(&re->extctl, &list);
				141	for_dev = re->scheduled_for;
				142	re->scheduled_for = NULL;
				143	spin_unlock(&re->lock);
				144
				145	if (err == 0) {
				146	nritems = level ? btrfs_header_nritems(eb) : 0;
				147	generation = btrfs_header_generation(eb);
				148	/*
				149	* FIXME: currently we just set nritems to 0 if this is a leaf,
				150	* effectively ignoring the content. In a next step we could
				151	* trigger more readahead depending from the content, e.g.
				152	* fetch the checksums for the extents in the leaf.
				153	*/
				154	} else {
				155	/*
				156	* this is the error case, the extent buffer has not been
				157	* read correctly. We won't access anything from it and
				158	* just cleanup our data structures. Effectively this will
				159	* cut the branch below this node from read ahead.
				160	*/
				161	nritems = 0;
				162	generation = 0;
				163	}
				164
				165	for (i = 0; i < nritems; i++) {
				166	struct reada_extctl *rec;
				167	u64 n_gen;
				168	struct btrfs_key key;
				169	struct btrfs_key next_key;
				170
				171	btrfs_node_key_to_cpu(eb, &key, i);
				172	if (i + 1 < nritems)
				173	btrfs_node_key_to_cpu(eb, &next_key, i + 1);
				174	else
				175	next_key = re->top;
				176	bytenr = btrfs_node_blockptr(eb, i);
				177	n_gen = btrfs_node_ptr_generation(eb, i);
				178
				179	list_for_each_entry(rec, &list, list) {
				180	struct reada_control *rc = rec->rc;
				181
				182	/*
				183	* if the generation doesn't match, just ignore this
				184	* extctl. This will probably cut off a branch from
				185	* prefetch. Alternatively one could start a new (sub-)
				186	* prefetch for this branch, starting again from root.
				187	* FIXME: move the generation check out of this loop
				188	*/
				189	#ifdef DEBUG
				190	if (rec->generation != generation) {
				191	printk(KERN_DEBUG "generation mismatch for "
				192	"(%llu,%d,%llu) %llu != %llu\n",
				193	key.objectid, key.type, key.offset,
				194	rec->generation, generation);
				195	}
				196	#endif
				197	if (rec->generation == generation &&
				198	btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
				199	btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
				200	reada_add_block(rc, bytenr, &next_key,
				201	level - 1, n_gen);
				202	}
				203	}
				204	/*
				205	* free extctl records
				206	*/
				207	while (!list_empty(&list)) {
				208	struct reada_control *rc;
				209	struct reada_extctl *rec;
				210
				211	rec = list_first_entry(&list, struct reada_extctl, list);
				212	list_del(&rec->list);
				213	rc = rec->rc;
				214	kfree(rec);
				215
				216	kref_get(&rc->refcnt);
				217	if (atomic_dec_and_test(&rc->elems)) {
				218	kref_put(&rc->refcnt, reada_control_release);
				219	wake_up(&rc->wait);
				220	}
				221	kref_put(&rc->refcnt, reada_control_release);
				222
				223	reada_extent_put(fs_info, re); /* one ref for each entry */
				224	}
				225	reada_extent_put(fs_info, re); /* our ref */
				226	if (for_dev)
				227	atomic_dec(&for_dev->reada_in_flight);
				228
				229	return 0;
				230	}
				231
				232	/*
				233	* start is passed separately in case eb in NULL, which may be the case with
				234	* failed I/O
				235	*/
				236	int btree_readahead_hook(struct btrfs_root root, struct extent_buffer eb,
				237	u64 start, int err)
				238	{
				239	int ret;
				240
				241	ret = __readahead_hook(root, eb, start, err);
				242
				243	reada_start_machine(root->fs_info);
				244
				245	return ret;
				246	}
				247
				248	static struct reada_zone reada_find_zone(struct btrfs_fs_info fs_info,
				249	struct btrfs_device *dev, u64 logical,
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	250	struct btrfs_bio *bbio)
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	251	{
				252	int ret;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	253	struct reada_zone *zone;
				254	struct btrfs_block_group_cache *cache = NULL;
				255	u64 start;
				256	u64 end;
				257	int i;
				258
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	259	zone = NULL;
				260	spin_lock(&fs_info->reada_lock);
				261	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
				262	logical >> PAGE_CACHE_SHIFT, 1);
				263	if (ret == 1)
				264	kref_get(&zone->refcnt);
				265	spin_unlock(&fs_info->reada_lock);
				266
				267	if (ret == 1) {
				268	if (logical >= zone->start && logical < zone->end)
				269	return zone;
				270	spin_lock(&fs_info->reada_lock);
				271	kref_put(&zone->refcnt, reada_zone_release);
				272	spin_unlock(&fs_info->reada_lock);
				273	}
				274
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	275	cache = btrfs_lookup_block_group(fs_info, logical);
				276	if (!cache)
				277	return NULL;
				278
				279	start = cache->key.objectid;
				280	end = start + cache->key.offset - 1;
				281	btrfs_put_block_group(cache);
				282
				283	zone = kzalloc(sizeof(*zone), GFP_NOFS);
				284	if (!zone)
				285	return NULL;
				286
				287	zone->start = start;
				288	zone->end = end;
				289	INIT_LIST_HEAD(&zone->list);
				290	spin_lock_init(&zone->lock);
				291	zone->locked = 0;
				292	kref_init(&zone->refcnt);
				293	zone->elems = 0;
				294	zone->device = dev; /* our device always sits at index 0 */
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	295	for (i = 0; i < bbio->num_stripes; ++i) {
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	296	/* bounds have already been checked */
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	297	zone->devs[i] = bbio->stripes[i].dev;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	298	}
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	299	zone->ndevs = bbio->num_stripes;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	300
				301	spin_lock(&fs_info->reada_lock);
				302	ret = radix_tree_insert(&dev->reada_zones,
Chris Mason	a175423	2012-02-28 12:42:44 -0500	[diff] [blame]	303	(unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	304	zone);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	305
Arne Jansen	8c9c2bf	2012-02-25 09:09:30 +0100	[diff] [blame]	306	if (ret == -EEXIST) {
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	307	kfree(zone);
Arne Jansen	8c9c2bf	2012-02-25 09:09:30 +0100	[diff] [blame]	308	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
				309	logical >> PAGE_CACHE_SHIFT, 1);
				310	if (ret == 1)
				311	kref_get(&zone->refcnt);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	312	}
Arne Jansen	8c9c2bf	2012-02-25 09:09:30 +0100	[diff] [blame]	313	spin_unlock(&fs_info->reada_lock);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	314
				315	return zone;
				316	}
				317
				318	static struct reada_extent reada_find_extent(struct btrfs_root root,
				319	u64 logical,
				320	struct btrfs_key *top, int level)
				321	{
				322	int ret;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	323	struct reada_extent *re = NULL;
Arne Jansen	8c9c2bf	2012-02-25 09:09:30 +0100	[diff] [blame]	324	struct reada_extent *re_exist = NULL;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	325	struct btrfs_fs_info *fs_info = root->fs_info;
				326	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	327	struct btrfs_bio *bbio = NULL;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	328	struct btrfs_device *dev;
Arne Jansen	207a232	2012-02-25 09:09:47 +0100	[diff] [blame]	329	struct btrfs_device *prev_dev;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	330	u32 blocksize;
				331	u64 length;
				332	int nzones = 0;
				333	int i;
				334	unsigned long index = logical >> PAGE_CACHE_SHIFT;
				335
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	336	spin_lock(&fs_info->reada_lock);
				337	re = radix_tree_lookup(&fs_info->reada_tree, index);
				338	if (re)
Al Viro	99621b4	2012-08-29 16:31:33 -0400	[diff] [blame]	339	re->refcnt++;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	340	spin_unlock(&fs_info->reada_lock);
				341
Arne Jansen	8c9c2bf	2012-02-25 09:09:30 +0100	[diff] [blame]	342	if (re)
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	343	return re;
				344
				345	re = kzalloc(sizeof(*re), GFP_NOFS);
				346	if (!re)
				347	return NULL;
				348
				349	blocksize = btrfs_level_size(root, level);
				350	re->logical = logical;
				351	re->blocksize = blocksize;
				352	re->top = *top;
				353	INIT_LIST_HEAD(&re->extctl);
				354	spin_lock_init(&re->lock);
Al Viro	99621b4	2012-08-29 16:31:33 -0400	[diff] [blame]	355	re->refcnt = 1;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	356
				357	/*
				358	* map block
				359	*/
				360	length = blocksize;
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	361	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
				362	if (ret \|\| !bbio \|\| length < blocksize)
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	363	goto error;
				364
Stefan Behrens	94598ba	2012-03-27 14:21:26 -0400	[diff] [blame]	365	if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	366	printk(KERN_ERR "btrfs readahead: more than %d copies not "
Stefan Behrens	94598ba	2012-03-27 14:21:26 -0400	[diff] [blame]	367	"supported", BTRFS_MAX_MIRRORS);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	368	goto error;
				369	}
				370
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	371	for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	372	struct reada_zone *zone;
				373
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	374	dev = bbio->stripes[nzones].dev;
				375	zone = reada_find_zone(fs_info, dev, logical, bbio);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	376	if (!zone)
				377	break;
				378
				379	re->zones[nzones] = zone;
				380	spin_lock(&zone->lock);
				381	if (!zone->elems)
				382	kref_get(&zone->refcnt);
				383	++zone->elems;
				384	spin_unlock(&zone->lock);
				385	spin_lock(&fs_info->reada_lock);
				386	kref_put(&zone->refcnt, reada_zone_release);
				387	spin_unlock(&fs_info->reada_lock);
				388	}
				389	re->nzones = nzones;
				390	if (nzones == 0) {
				391	/* not a single zone found, error and out */
				392	goto error;
				393	}
				394
				395	/* insert extent in reada_tree + all per-device trees, all or nothing */
				396	spin_lock(&fs_info->reada_lock);
				397	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
Arne Jansen	8c9c2bf	2012-02-25 09:09:30 +0100	[diff] [blame]	398	if (ret == -EEXIST) {
				399	re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
				400	BUG_ON(!re_exist);
Al Viro	99621b4	2012-08-29 16:31:33 -0400	[diff] [blame]	401	re_exist->refcnt++;
Arne Jansen	8c9c2bf	2012-02-25 09:09:30 +0100	[diff] [blame]	402	spin_unlock(&fs_info->reada_lock);
				403	goto error;
				404	}
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	405	if (ret) {
				406	spin_unlock(&fs_info->reada_lock);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	407	goto error;
				408	}
Arne Jansen	207a232	2012-02-25 09:09:47 +0100	[diff] [blame]	409	prev_dev = NULL;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	410	for (i = 0; i < nzones; ++i) {
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	411	dev = bbio->stripes[i].dev;
Arne Jansen	207a232	2012-02-25 09:09:47 +0100	[diff] [blame]	412	if (dev == prev_dev) {
				413	/*
				414	* in case of DUP, just add the first zone. As both
				415	* are on the same device, there's nothing to gain
				416	* from adding both.
				417	* Also, it wouldn't work, as the tree is per device
				418	* and adding would fail with EEXIST
				419	*/
				420	continue;
				421	}
				422	prev_dev = dev;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	423	ret = radix_tree_insert(&dev->reada_extents, index, re);
				424	if (ret) {
				425	while (--i >= 0) {
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	426	dev = bbio->stripes[i].dev;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	427	BUG_ON(dev == NULL);
				428	radix_tree_delete(&dev->reada_extents, index);
				429	}
				430	BUG_ON(fs_info == NULL);
				431	radix_tree_delete(&fs_info->reada_tree, index);
				432	spin_unlock(&fs_info->reada_lock);
				433	goto error;
				434	}
				435	}
				436	spin_unlock(&fs_info->reada_lock);
				437
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	438	kfree(bbio);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	439	return re;
				440
				441	error:
				442	while (nzones) {
				443	struct reada_zone *zone;
				444
				445	--nzones;
				446	zone = re->zones[nzones];
				447	kref_get(&zone->refcnt);
				448	spin_lock(&zone->lock);
				449	--zone->elems;
				450	if (zone->elems == 0) {
				451	/*
				452	* no fs_info->reada_lock needed, as this can't be
				453	* the last ref
				454	*/
				455	kref_put(&zone->refcnt, reada_zone_release);
				456	}
				457	spin_unlock(&zone->lock);
				458
				459	spin_lock(&fs_info->reada_lock);
				460	kref_put(&zone->refcnt, reada_zone_release);
				461	spin_unlock(&fs_info->reada_lock);
				462	}
Ilya Dryomov	21ca543	2011-11-04 09:41:02 -0400	[diff] [blame]	463	kfree(bbio);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	464	kfree(re);
Arne Jansen	8c9c2bf	2012-02-25 09:09:30 +0100	[diff] [blame]	465	return re_exist;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	466	}
				467
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	468	static void reada_extent_put(struct btrfs_fs_info *fs_info,
				469	struct reada_extent *re)
				470	{
				471	int i;
				472	unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
				473
				474	spin_lock(&fs_info->reada_lock);
Al Viro	99621b4	2012-08-29 16:31:33 -0400	[diff] [blame]	475	if (--re->refcnt) {
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	476	spin_unlock(&fs_info->reada_lock);
				477	return;
				478	}
				479
				480	radix_tree_delete(&fs_info->reada_tree, index);
				481	for (i = 0; i < re->nzones; ++i) {
				482	struct reada_zone *zone = re->zones[i];
				483
				484	radix_tree_delete(&zone->device->reada_extents, index);
				485	}
				486
				487	spin_unlock(&fs_info->reada_lock);
				488
				489	for (i = 0; i < re->nzones; ++i) {
				490	struct reada_zone *zone = re->zones[i];
				491
				492	kref_get(&zone->refcnt);
				493	spin_lock(&zone->lock);
				494	--zone->elems;
				495	if (zone->elems == 0) {
				496	/* no fs_info->reada_lock needed, as this can't be
				497	* the last ref */
				498	kref_put(&zone->refcnt, reada_zone_release);
				499	}
				500	spin_unlock(&zone->lock);
				501
				502	spin_lock(&fs_info->reada_lock);
				503	kref_put(&zone->refcnt, reada_zone_release);
				504	spin_unlock(&fs_info->reada_lock);
				505	}
				506	if (re->scheduled_for)
				507	atomic_dec(&re->scheduled_for->reada_in_flight);
				508
				509	kfree(re);
				510	}
				511
				512	static void reada_zone_release(struct kref *kref)
				513	{
				514	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
				515
				516	radix_tree_delete(&zone->device->reada_zones,
				517	zone->end >> PAGE_CACHE_SHIFT);
				518
				519	kfree(zone);
				520	}
				521
				522	static void reada_control_release(struct kref *kref)
				523	{
				524	struct reada_control *rc = container_of(kref, struct reada_control,
				525	refcnt);
				526
				527	kfree(rc);
				528	}
				529
				530	static int reada_add_block(struct reada_control *rc, u64 logical,
				531	struct btrfs_key *top, int level, u64 generation)
				532	{
				533	struct btrfs_root *root = rc->root;
				534	struct reada_extent *re;
				535	struct reada_extctl *rec;
				536
				537	re = reada_find_extent(root, logical, top, level); /* takes one ref */
				538	if (!re)
				539	return -1;
				540
				541	rec = kzalloc(sizeof(*rec), GFP_NOFS);
				542	if (!rec) {
				543	reada_extent_put(root->fs_info, re);
				544	return -1;
				545	}
				546
				547	rec->rc = rc;
				548	rec->generation = generation;
				549	atomic_inc(&rc->elems);
				550
				551	spin_lock(&re->lock);
				552	list_add_tail(&rec->list, &re->extctl);
				553	spin_unlock(&re->lock);
				554
				555	/* leave the ref on the extent */
				556
				557	return 0;
				558	}
				559
				560	/*
				561	* called with fs_info->reada_lock held
				562	*/
				563	static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
				564	{
				565	int i;
				566	unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
				567
				568	for (i = 0; i < zone->ndevs; ++i) {
				569	struct reada_zone *peer;
				570	peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
				571	if (peer && peer->device != zone->device)
				572	peer->locked = lock;
				573	}
				574	}
				575
				576	/*
				577	* called with fs_info->reada_lock held
				578	*/
				579	static int reada_pick_zone(struct btrfs_device *dev)
				580	{
				581	struct reada_zone *top_zone = NULL;
				582	struct reada_zone *top_locked_zone = NULL;
				583	u64 top_elems = 0;
				584	u64 top_locked_elems = 0;
				585	unsigned long index = 0;
				586	int ret;
				587
				588	if (dev->reada_curr_zone) {
				589	reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
				590	kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
				591	dev->reada_curr_zone = NULL;
				592	}
				593	/* pick the zone with the most elements */
				594	while (1) {
				595	struct reada_zone *zone;
				596
				597	ret = radix_tree_gang_lookup(&dev->reada_zones,
				598	(void **)&zone, index, 1);
				599	if (ret == 0)
				600	break;
				601	index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
				602	if (zone->locked) {
				603	if (zone->elems > top_locked_elems) {
				604	top_locked_elems = zone->elems;
				605	top_locked_zone = zone;
				606	}
				607	} else {
				608	if (zone->elems > top_elems) {
				609	top_elems = zone->elems;
				610	top_zone = zone;
				611	}
				612	}
				613	}
				614	if (top_zone)
				615	dev->reada_curr_zone = top_zone;
				616	else if (top_locked_zone)
				617	dev->reada_curr_zone = top_locked_zone;
				618	else
				619	return 0;
				620
				621	dev->reada_next = dev->reada_curr_zone->start;
				622	kref_get(&dev->reada_curr_zone->refcnt);
				623	reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
				624
				625	return 1;
				626	}
				627
				628	static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
				629	struct btrfs_device *dev)
				630	{
				631	struct reada_extent *re = NULL;
				632	int mirror_num = 0;
				633	struct extent_buffer *eb = NULL;
				634	u64 logical;
				635	u32 blocksize;
				636	int ret;
				637	int i;
				638	int need_kick = 0;
				639
				640	spin_lock(&fs_info->reada_lock);
				641	if (dev->reada_curr_zone == NULL) {
				642	ret = reada_pick_zone(dev);
				643	if (!ret) {
				644	spin_unlock(&fs_info->reada_lock);
				645	return 0;
				646	}
				647	}
				648	/*
				649	* FIXME currently we issue the reads one extent at a time. If we have
				650	* a contiguous block of extents, we could also coagulate them or use
				651	* plugging to speed things up
				652	*/
				653	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
				654	dev->reada_next >> PAGE_CACHE_SHIFT, 1);
				655	if (ret == 0 \|\| re->logical >= dev->reada_curr_zone->end) {
				656	ret = reada_pick_zone(dev);
				657	if (!ret) {
				658	spin_unlock(&fs_info->reada_lock);
				659	return 0;
				660	}
				661	re = NULL;
				662	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
				663	dev->reada_next >> PAGE_CACHE_SHIFT, 1);
				664	}
				665	if (ret == 0) {
				666	spin_unlock(&fs_info->reada_lock);
				667	return 0;
				668	}
				669	dev->reada_next = re->logical + re->blocksize;
Al Viro	99621b4	2012-08-29 16:31:33 -0400	[diff] [blame]	670	re->refcnt++;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	671
				672	spin_unlock(&fs_info->reada_lock);
				673
				674	/*
				675	* find mirror num
				676	*/
				677	for (i = 0; i < re->nzones; ++i) {
				678	if (re->zones[i]->device == dev) {
				679	mirror_num = i + 1;
				680	break;
				681	}
				682	}
				683	logical = re->logical;
				684	blocksize = re->blocksize;
				685
				686	spin_lock(&re->lock);
				687	if (re->scheduled_for == NULL) {
				688	re->scheduled_for = dev;
				689	need_kick = 1;
				690	}
				691	spin_unlock(&re->lock);
				692
				693	reada_extent_put(fs_info, re);
				694
				695	if (!need_kick)
				696	return 0;
				697
				698	atomic_inc(&dev->reada_in_flight);
				699	ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
				700	mirror_num, &eb);
				701	if (ret)
				702	__readahead_hook(fs_info->extent_root, NULL, logical, ret);
				703	else if (eb)
				704	__readahead_hook(fs_info->extent_root, eb, eb->start, ret);
				705
				706	if (eb)
				707	free_extent_buffer(eb);
				708
				709	return 1;
				710
				711	}
				712
				713	static void reada_start_machine_worker(struct btrfs_work *work)
				714	{
				715	struct reada_machine_work *rmw;
				716	struct btrfs_fs_info *fs_info;
Stefan Behrens	3d136a1	2012-02-03 11:20:04 +0100	[diff] [blame]	717	int old_ioprio;
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	718
				719	rmw = container_of(work, struct reada_machine_work, work);
				720	fs_info = rmw->fs_info;
				721
				722	kfree(rmw);
				723
Stefan Behrens	3d136a1	2012-02-03 11:20:04 +0100	[diff] [blame]	724	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
				725	task_nice_ioprio(current));
				726	set_task_ioprio(current, BTRFS_IOPRIO_READA);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	727	__reada_start_machine(fs_info);
Stefan Behrens	3d136a1	2012-02-03 11:20:04 +0100	[diff] [blame]	728	set_task_ioprio(current, old_ioprio);
Arne Jansen	7414a03	2011-05-23 14:33:49 +0200	[diff] [blame]	729	}
				730
				731	static void __reada_start_machine(struct btrfs_fs_info *fs_info)
				732	{
				733	struct btrfs_device *device;
				734	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				735	u64 enqueued;
				736	u64 total = 0;
				737	int i;
				738
				739	do {
				740	enqueued = 0;
				741	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				742	if (atomic_read(&device->reada_in_flight) <
				743	MAX_IN_FLIGHT)
				744	enqueued += reada_start_machine_dev(fs_info,
				745	device);
				746	}
				747	total += enqueued;
				748	} while (enqueued && total < 10000);
				749
				750	if (enqueued == 0)
				751	return;
				752
				753	/*
				754	* If everything is already in the cache, this is effectively single
				755	* threaded. To a) not hold the caller for too long and b) to utilize
				756	* more cores, we broke the loop above after 10000 iterations and now
				757	* enqueue to workers to finish it. This will distribute the load to
				758	* the cores.
				759	*/
				760	for (i = 0; i < 2; ++i)
				761	reada_start_machine(fs_info);
				762	}
				763
				764	static void reada_start_machine(struct btrfs_fs_info *fs_info)
				765	{
				766	struct reada_machine_work *rmw;
				767
				768	rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
				769	if (!rmw) {
				770	/* FIXME we cannot handle this properly right now */
				771	BUG();
				772	}
				773	rmw->work.func = reada_start_machine_worker;
				774	rmw->fs_info = fs_info;
				775
				776	btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
				777	}
				778
				779	#ifdef DEBUG
				780	static void dump_devs(struct btrfs_fs_info *fs_info, int all)
				781	{
				782	struct btrfs_device *device;
				783	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				784	unsigned long index;
				785	int ret;
				786	int i;
				787	int j;
				788	int cnt;
				789
				790	spin_lock(&fs_info->reada_lock);
				791	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				792	printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
				793	atomic_read(&device->reada_in_flight));
				794	index = 0;
				795	while (1) {
				796	struct reada_zone *zone;
				797	ret = radix_tree_gang_lookup(&device->reada_zones,
				798	(void **)&zone, index, 1);
				799	if (ret == 0)
				800	break;
				801	printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
				802	"%d devs", zone->start, zone->end, zone->elems,
				803	zone->locked);
				804	for (j = 0; j < zone->ndevs; ++j) {
				805	printk(KERN_CONT " %lld",
				806	zone->devs[j]->devid);
				807	}
				808	if (device->reada_curr_zone == zone)
				809	printk(KERN_CONT " curr off %llu",
				810	device->reada_next - zone->start);
				811	printk(KERN_CONT "\n");
				812	index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
				813	}
				814	cnt = 0;
				815	index = 0;
				816	while (all) {
				817	struct reada_extent *re = NULL;
				818
				819	ret = radix_tree_gang_lookup(&device->reada_extents,
				820	(void **)&re, index, 1);
				821	if (ret == 0)
				822	break;
				823	printk(KERN_DEBUG
				824	" re: logical %llu size %u empty %d for %lld",
				825	re->logical, re->blocksize,
				826	list_empty(&re->extctl), re->scheduled_for ?
				827	re->scheduled_for->devid : -1);
				828
				829	for (i = 0; i < re->nzones; ++i) {
				830	printk(KERN_CONT " zone %llu-%llu devs",
				831	re->zones[i]->start,
				832	re->zones[i]->end);
				833	for (j = 0; j < re->zones[i]->ndevs; ++j) {
				834	printk(KERN_CONT " %lld",
				835	re->zones[i]->devs[j]->devid);
				836	}
				837	}
				838	printk(KERN_CONT "\n");
				839	index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
				840	if (++cnt > 15)
				841	break;
				842	}
				843	}
				844
				845	index = 0;
				846	cnt = 0;
				847	while (all) {
				848	struct reada_extent *re = NULL;
				849
				850	ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
				851	index, 1);
				852	if (ret == 0)
				853	break;
				854	if (!re->scheduled_for) {
				855	index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
				856	continue;
				857	}
				858	printk(KERN_DEBUG
				859	"re: logical %llu size %u list empty %d for %lld",
				860	re->logical, re->blocksize, list_empty(&re->extctl),
				861	re->scheduled_for ? re->scheduled_for->devid : -1);
				862	for (i = 0; i < re->nzones; ++i) {
				863	printk(KERN_CONT " zone %llu-%llu devs",
				864	re->zones[i]->start,
				865	re->zones[i]->end);
				866	for (i = 0; i < re->nzones; ++i) {
				867	printk(KERN_CONT " zone %llu-%llu devs",
				868	re->zones[i]->start,
				869	re->zones[i]->end);
				870	for (j = 0; j < re->zones[i]->ndevs; ++j) {
				871	printk(KERN_CONT " %lld",
				872	re->zones[i]->devs[j]->devid);
				873	}
				874	}
				875	}
				876	printk(KERN_CONT "\n");
				877	index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
				878	}
				879	spin_unlock(&fs_info->reada_lock);
				880	}
				881	#endif
				882
				883	/*
				884	* interface
				885	*/
				886	struct reada_control btrfs_reada_add(struct btrfs_root root,
				887	struct btrfs_key key_start, struct btrfs_key key_end)
				888	{
				889	struct reada_control *rc;
				890	u64 start;
				891	u64 generation;
				892	int level;
				893	struct extent_buffer *node;
				894	static struct btrfs_key max_key = {
				895	.objectid = (u64)-1,
				896	.type = (u8)-1,
				897	.offset = (u64)-1
				898	};
				899
				900	rc = kzalloc(sizeof(*rc), GFP_NOFS);
				901	if (!rc)
				902	return ERR_PTR(-ENOMEM);
				903
				904	rc->root = root;
				905	rc->key_start = *key_start;
				906	rc->key_end = *key_end;
				907	atomic_set(&rc->elems, 0);
				908	init_waitqueue_head(&rc->wait);
				909	kref_init(&rc->refcnt);
				910	kref_get(&rc->refcnt); /* one ref for having elements */
				911
				912	node = btrfs_root_node(root);
				913	start = node->start;
				914	level = btrfs_header_level(node);
				915	generation = btrfs_header_generation(node);
				916	free_extent_buffer(node);
				917
				918	reada_add_block(rc, start, &max_key, level, generation);
				919
				920	reada_start_machine(root->fs_info);
				921
				922	return rc;
				923	}
				924
				925	#ifdef DEBUG
				926	int btrfs_reada_wait(void *handle)
				927	{
				928	struct reada_control *rc = handle;
				929
				930	while (atomic_read(&rc->elems)) {
				931	wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
				932	5 * HZ);
				933	dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
				934	}
				935
				936	dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
				937
				938	kref_put(&rc->refcnt, reada_control_release);
				939
				940	return 0;
				941	}
				942	#else
				943	int btrfs_reada_wait(void *handle)
				944	{
				945	struct reada_control *rc = handle;
				946
				947	while (atomic_read(&rc->elems)) {
				948	wait_event(rc->wait, atomic_read(&rc->elems) == 0);
				949	}
				950
				951	kref_put(&rc->refcnt, reada_control_release);
				952
				953	return 0;
				954	}
				955	#endif
				956
				957	void btrfs_reada_detach(void *handle)
				958	{
				959	struct reada_control *rc = handle;
				960
				961	kref_put(&rc->refcnt, reada_control_release);
				962	}