Blame - fs/ubifs/gc.c - kernel/msm

blob: 618c2701d3a788ff821f35148d230090209b1eb6 [file] [log] [blame]

Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	1	/*
				2	* This file is part of UBIFS.
				3	*
				4	* Copyright (C) 2006-2008 Nokia Corporation.
				5	*
				6	* This program is free software; you can redistribute it and/or modify it
				7	* under the terms of the GNU General Public License version 2 as published by
				8	* the Free Software Foundation.
				9	*
				10	* This program is distributed in the hope that it will be useful, but WITHOUT
				11	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				12	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
				13	* more details.
				14	*
				15	* You should have received a copy of the GNU General Public License along with
				16	* this program; if not, write to the Free Software Foundation, Inc., 51
				17	* Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
				18	*
				19	* Authors: Adrian Hunter
				20	* Artem Bityutskiy (Битюцкий Артём)
				21	*/
				22
				23	/*
				24	* This file implements garbage collection. The procedure for garbage collection
				25	* is different depending on whether a LEB as an index LEB (contains index
				26	* nodes) or not. For non-index LEBs, garbage collection finds a LEB which
				27	* contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
				28	* nodes to the journal, at which point the garbage-collected LEB is free to be
				29	* reused. For index LEBs, garbage collection marks the non-obsolete index nodes
				30	* dirty in the TNC, and after the next commit, the garbage-collected LEB is
				31	* to be reused. Garbage collection will cause the number of dirty index nodes
				32	* to grow, however sufficient space is reserved for the index to ensure the
				33	* commit will never run out of space.
Artem Bityutskiy	7078202	2009-01-19 19:57:27 +0200	[diff] [blame]	34	*
				35	* Notes about dead watermark. At current UBIFS implementation we assume that
				36	* LEBs which have less than @c->dead_wm bytes of free + dirty space are full
				37	* and not worth garbage-collecting. The dead watermark is one min. I/O unit
				38	* size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
				39	* Garbage Collector has to synchronize the GC head's write buffer before
				40	* returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
				41	* actually reclaim even very small pieces of dirty space by garbage collecting
				42	* enough dirty LEBs, but we do not bother doing this at this implementation.
				43	*
				44	* Notes about dark watermark. The results of GC work depends on how big are
				45	* the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
				46	* if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
				47	* have to waste large pieces of free space at the end of LEB B, because nodes
				48	* from LEB A would not fit. And the worst situation is when all nodes are of
				49	* maximum size. So dark watermark is the amount of free + dirty space in LEB
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	50	* which are guaranteed to be reclaimable. If LEB has less space, the GC might
Artem Bityutskiy	7078202	2009-01-19 19:57:27 +0200	[diff] [blame]	51	* be unable to reclaim it. So, LEBs with free + dirty greater than dark
				52	* watermark are "good" LEBs from GC's point of few. The other LEBs are not so
				53	* good, and GC takes extra care when moving them.
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	54	*/
				55
				56	#include <linux/pagemap.h>
				57	#include "ubifs.h"
				58
				59	/*
Frederik Schwarzer	025dfda	2008-10-16 19:02:37 +0200	[diff] [blame]	60	* GC may need to move more than one LEB to make progress. The below constants
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	61	* define "soft" and "hard" limits on the number of LEBs the garbage collector
				62	* may move.
				63	*/
				64	#define SOFT_LEBS_LIMIT 4
				65	#define HARD_LEBS_LIMIT 32
				66
				67	/**
				68	* switch_gc_head - switch the garbage collection journal head.
				69	* @c: UBIFS file-system description object
				70	* @buf: buffer to write
				71	* @len: length of the buffer to write
				72	* @lnum: LEB number written is returned here
				73	* @offs: offset written is returned here
				74	*
				75	* This function switch the GC head to the next LEB which is reserved in
				76	* @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
				77	* and other negative error code in case of failures.
				78	*/
				79	static int switch_gc_head(struct ubifs_info *c)
				80	{
				81	int err, gc_lnum = c->gc_lnum;
				82	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
				83
				84	ubifs_assert(gc_lnum != -1);
				85	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
				86	wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
				87	c->leb_size - wbuf->offs - wbuf->used);
				88
				89	err = ubifs_wbuf_sync_nolock(wbuf);
				90	if (err)
				91	return err;
				92
				93	/*
				94	* The GC write-buffer was synchronized, we may safely unmap
				95	* 'c->gc_lnum'.
				96	*/
				97	err = ubifs_leb_unmap(c, gc_lnum);
				98	if (err)
				99	return err;
				100
				101	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
				102	if (err)
				103	return err;
				104
				105	c->gc_lnum = -1;
				106	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
				107	return err;
				108	}
				109
				110	/**
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	111	* list_sort - sort a list.
				112	* @priv: private data, passed to @cmp
				113	* @head: the list to sort
				114	* @cmp: the elements comparison function
Adrian Hunter	46773be	2008-09-11 12:57:49 +0300	[diff] [blame]	115	*
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	116	* This function has been implemented by Mark J Roberts <mjr@znex.org>. It
				117	* implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
				118	* in ascending order.
				119	*
				120	* The comparison function @cmp is supposed to return a negative value if @a is
				121	* than @b, and a positive value if @a is greater than @b. If @a and @b are
				122	* equivalent, then it does not matter what this function returns.
Adrian Hunter	46773be	2008-09-11 12:57:49 +0300	[diff] [blame]	123	*/
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	124	static void list_sort(void priv, struct list_head head,
				125	int (cmp)(void priv, struct list_head *a,
				126	struct list_head *b))
Adrian Hunter	46773be	2008-09-11 12:57:49 +0300	[diff] [blame]	127	{
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	128	struct list_head p, q, e, list, tail, oldhead;
				129	int insize, nmerges, psize, qsize, i;
Adrian Hunter	46773be	2008-09-11 12:57:49 +0300	[diff] [blame]	130
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	131	if (list_empty(head))
				132	return;
				133
				134	list = head->next;
				135	list_del(head);
				136	insize = 1;
				137	for (;;) {
				138	p = oldhead = list;
				139	list = tail = NULL;
				140	nmerges = 0;
				141
				142	while (p) {
				143	nmerges++;
				144	q = p;
				145	psize = 0;
				146	for (i = 0; i < insize; i++) {
				147	psize++;
				148	q = q->next == oldhead ? NULL : q->next;
				149	if (!q)
				150	break;
				151	}
				152
				153	qsize = insize;
				154	while (psize > 0 \|\| (qsize > 0 && q)) {
				155	if (!psize) {
				156	e = q;
				157	q = q->next;
				158	qsize--;
				159	if (q == oldhead)
				160	q = NULL;
				161	} else if (!qsize \|\| !q) {
				162	e = p;
				163	p = p->next;
				164	psize--;
				165	if (p == oldhead)
				166	p = NULL;
				167	} else if (cmp(priv, p, q) <= 0) {
				168	e = p;
				169	p = p->next;
				170	psize--;
				171	if (p == oldhead)
				172	p = NULL;
				173	} else {
				174	e = q;
				175	q = q->next;
				176	qsize--;
				177	if (q == oldhead)
				178	q = NULL;
				179	}
				180	if (tail)
				181	tail->next = e;
				182	else
				183	list = e;
				184	e->prev = tail;
				185	tail = e;
				186	}
				187	p = q;
				188	}
				189
				190	tail->next = list;
				191	list->prev = tail;
				192
				193	if (nmerges <= 1)
Adrian Hunter	46773be	2008-09-11 12:57:49 +0300	[diff] [blame]	194	break;
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	195
				196	insize *= 2;
Adrian Hunter	46773be	2008-09-11 12:57:49 +0300	[diff] [blame]	197	}
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	198
				199	head->next = list;
				200	head->prev = list->prev;
				201	list->prev->next = head;
				202	list->prev = head;
Adrian Hunter	46773be	2008-09-11 12:57:49 +0300	[diff] [blame]	203	}
				204
				205	/**
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	206	* data_nodes_cmp - compare 2 data nodes.
				207	* @priv: UBIFS file-system description object
				208	* @a: first data node
				209	* @a: second data node
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	210	*
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	211	* This function compares data nodes @a and @b. Returns %1 if @a has greater
				212	* inode or block number, and %-1 otherwise.
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	213	*/
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	214	int data_nodes_cmp(void priv, struct list_head a, struct list_head *b)
				215	{
				216	ino_t inuma, inumb;
				217	struct ubifs_info *c = priv;
				218	struct ubifs_scan_node sa, sb;
				219
				220	cond_resched();
				221	sa = list_entry(a, struct ubifs_scan_node, list);
				222	sb = list_entry(b, struct ubifs_scan_node, list);
				223	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
				224	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
				225
				226	inuma = key_inum(c, &sa->key);
				227	inumb = key_inum(c, &sb->key);
				228
				229	if (inuma == inumb) {
				230	unsigned int blka = key_block(c, &sa->key);
				231	unsigned int blkb = key_block(c, &sb->key);
				232
				233	if (blka <= blkb)
				234	return -1;
				235	} else if (inuma <= inumb)
				236	return -1;
				237
				238	return 1;
				239	}
				240
				241	/*
				242	* nondata_nodes_cmp - compare 2 non-data nodes.
				243	* @priv: UBIFS file-system description object
				244	* @a: first node
				245	* @a: second node
				246	*
				247	* This function compares nodes @a and @b. It makes sure that inode nodes go
				248	* first and sorted by length in descending order. Directory entry nodes go
				249	* after inode nodes and are sorted in ascending hash valuer order.
				250	*/
				251	int nondata_nodes_cmp(void priv, struct list_head a, struct list_head *b)
				252	{
				253	int typea, typeb;
				254	ino_t inuma, inumb;
				255	struct ubifs_info *c = priv;
				256	struct ubifs_scan_node sa, sb;
				257
				258	cond_resched();
				259	sa = list_entry(a, struct ubifs_scan_node, list);
				260	sb = list_entry(b, struct ubifs_scan_node, list);
				261	typea = key_type(c, &sa->key);
				262	typeb = key_type(c, &sb->key);
				263	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
				264
				265	/* Inodes go before directory entries */
				266	if (typea == UBIFS_INO_KEY) {
				267	if (typeb == UBIFS_INO_KEY)
				268	return sb->len - sa->len;
				269	return -1;
				270	}
				271	if (typeb == UBIFS_INO_KEY)
				272	return 1;
				273
				274	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
				275	inuma = key_inum(c, &sa->key);
				276	inumb = key_inum(c, &sb->key);
				277
				278	if (inuma == inumb) {
				279	uint32_t hasha = key_hash(c, &sa->key);
				280	uint32_t hashb = key_hash(c, &sb->key);
				281
				282	if (hasha <= hashb)
				283	return -1;
				284	} else if (inuma <= inumb)
				285	return -1;
				286
				287	return 1;
				288	}
				289
				290	/**
				291	* sort_nodes - sort nodes for GC.
				292	* @c: UBIFS file-system description object
				293	* @sleb: describes nodes to sort and contains the result on exit
				294	* @nondata: contains non-data nodes on exit
				295	* @min: minimum node size is returned here
				296	*
				297	* This function sorts the list of inodes to garbage collect. First of all, it
				298	* kills obsolete nodes and separates data and non-data nodes to the
				299	* @sleb->nodes and @nondata lists correspondingly.
				300	*
				301	* Data nodes are then sorted in block number order - this is important for
				302	* bulk-read; data nodes with lower inode number go before data nodes with
				303	* higher inode number, and data nodes with lower block number go before data
				304	* nodes with higher block number;
				305	*
				306	* Non-data nodes are sorted as follows.
				307	* o First go inode nodes - they are sorted in descending length order.
				308	* o Then go directory entry nodes - they are sorted in hash order, which
				309	* should supposedly optimize 'readdir()'. Direntry nodes with lower parent
				310	* inode number go before direntry nodes with higher parent inode number,
				311	* and direntry nodes with lower name hash values go before direntry nodes
				312	* with higher name hash values.
				313	*
				314	* This function returns zero in case of success and a negative error code in
				315	* case of failure.
				316	*/
				317	static int sort_nodes(struct ubifs_info c, struct ubifs_scan_leb sleb,
				318	struct list_head nondata, int min)
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	319	{
				320	struct ubifs_scan_node snod, tmp;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	321
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	322	*min = INT_MAX;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	323
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	324	/* Separate data nodes and non-data nodes */
				325	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
				326	int err;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	327
				328	ubifs_assert(snod->type != UBIFS_IDX_NODE);
				329	ubifs_assert(snod->type != UBIFS_REF_NODE);
				330	ubifs_assert(snod->type != UBIFS_CS_NODE);
				331
				332	err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
				333	snod->offs, 0);
				334	if (err < 0)
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	335	return err;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	336
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	337	if (!err) {
				338	/* The node is obsolete, remove it from the list */
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	339	list_del(&snod->list);
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	340	kfree(snod);
				341	continue;
				342	}
				343
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	344	if (snod->len < *min)
				345	*min = snod->len;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	346
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	347	if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
				348	list_move_tail(&snod->list, nondata);
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	349	}
				350
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	351	/* Sort data and non-data nodes */
				352	list_sort(c, &sleb->nodes, &data_nodes_cmp);
				353	list_sort(c, nondata, &nondata_nodes_cmp);
				354	return 0;
				355	}
				356
				357	/**
				358	* move_node - move a node.
				359	* @c: UBIFS file-system description object
				360	* @sleb: describes the LEB to move nodes from
				361	* @snod: the mode to move
				362	* @wbuf: write-buffer to move node to
				363	*
				364	* This function moves node @snod to @wbuf, changes TNC correspondingly, and
				365	* destroys @snod. Returns zero in case of success and a negative error code in
				366	* case of failure.
				367	*/
				368	static int move_node(struct ubifs_info c, struct ubifs_scan_leb sleb,
				369	struct ubifs_scan_node snod, struct ubifs_wbuf wbuf)
				370	{
				371	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
				372
				373	cond_resched();
				374	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
				375	if (err)
				376	return err;
				377
				378	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
				379	snod->offs, new_lnum, new_offs,
				380	snod->len);
				381	list_del(&snod->list);
				382	kfree(snod);
				383	return err;
				384	}
				385
				386	/**
				387	* move_nodes - move nodes.
				388	* @c: UBIFS file-system description object
				389	* @sleb: describes the LEB to move nodes from
				390	*
				391	* This function moves valid nodes from data LEB described by @sleb to the GC
				392	* journal head. This function returns zero in case of success, %-EAGAIN if
				393	* commit is required, and other negative error codes in case of other
				394	* failures.
				395	*/
				396	static int move_nodes(struct ubifs_info c, struct ubifs_scan_leb sleb)
				397	{
				398	int err, min;
				399	LIST_HEAD(nondata);
				400	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	401
				402	if (wbuf->lnum == -1) {
				403	/*
				404	* The GC journal head is not set, because it is the first GC
				405	* invocation since mount.
				406	*/
				407	err = switch_gc_head(c);
				408	if (err)
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	409	return err;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	410	}
				411
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	412	err = sort_nodes(c, sleb, &nondata, &min);
				413	if (err)
				414	goto out;
				415
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	416	/* Write nodes to their new location. Use the first-fit strategy */
				417	while (1) {
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	418	int avail;
				419	struct ubifs_scan_node snod, tmp;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	420
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	421	/* Move data nodes */
				422	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
				423	avail = c->leb_size - wbuf->offs - wbuf->used;
				424	if (snod->len > avail)
				425	/*
				426	* Do not skip data nodes in order to optimize
				427	* bulk-read.
				428	*/
				429	break;
				430
				431	err = move_node(c, sleb, snod, wbuf);
				432	if (err)
				433	goto out;
				434	}
				435
				436	/* Move non-data nodes */
				437	list_for_each_entry_safe(snod, tmp, &nondata, list) {
				438	avail = c->leb_size - wbuf->offs - wbuf->used;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	439	if (avail < min)
				440	break;
				441
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	442	if (snod->len > avail) {
				443	/*
				444	* Keep going only if this is an inode with
				445	* some data. Otherwise stop and switch the GC
				446	* head. IOW, we assume that data-less inode
				447	* nodes and direntry nodes are roughly of the
				448	* same size.
				449	*/
				450	if (key_type(c, &snod->key) == UBIFS_DENT_KEY \|\|
				451	snod->len == UBIFS_INO_NODE_SZ)
				452	break;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	453	continue;
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	454	}
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	455
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	456	err = move_node(c, sleb, snod, wbuf);
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	457	if (err)
				458	goto out;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	459	}
				460
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	461	if (list_empty(&sleb->nodes) && list_empty(&nondata))
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	462	break;
				463
				464	/*
				465	* Waste the rest of the space in the LEB and switch to the
				466	* next LEB.
				467	*/
				468	err = switch_gc_head(c);
				469	if (err)
				470	goto out;
				471	}
				472
				473	return 0;
				474
				475	out:
Artem Bityutskiy	f10770f	2009-03-08 15:13:00 +0200	[diff] [blame]	476	list_splice_tail(&nondata, &sleb->nodes);
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	477	return err;
				478	}
				479
				480	/**
				481	* gc_sync_wbufs - sync write-buffers for GC.
				482	* @c: UBIFS file-system description object
				483	*
				484	* We must guarantee that obsoleting nodes are on flash. Unfortunately they may
				485	* be in a write-buffer instead. That is, a node could be written to a
				486	* write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
				487	* erased before the write-buffer is sync'd and then there is an unclean
				488	* unmount, then an existing node is lost. To avoid this, we sync all
				489	* write-buffers.
				490	*
				491	* This function returns %0 on success or a negative error code on failure.
				492	*/
				493	static int gc_sync_wbufs(struct ubifs_info *c)
				494	{
				495	int err, i;
				496
				497	for (i = 0; i < c->jhead_cnt; i++) {
				498	if (i == GCHD)
				499	continue;
				500	err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
				501	if (err)
				502	return err;
				503	}
				504	return 0;
				505	}
				506
				507	/**
				508	* ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
				509	* @c: UBIFS file-system description object
				510	* @lp: describes the LEB to garbage collect
				511	*
				512	* This function garbage-collects an LEB and returns one of the @LEB_FREED,
				513	* @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
				514	* required, and other negative error codes in case of failures.
				515	*/
				516	int ubifs_garbage_collect_leb(struct ubifs_info c, struct ubifs_lprops lp)
				517	{
				518	struct ubifs_scan_leb *sleb;
				519	struct ubifs_scan_node *snod;
				520	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
				521	int err = 0, lnum = lp->lnum;
				522
				523	ubifs_assert(c->gc_lnum != -1 \|\| wbuf->offs + wbuf->used == 0 \|\|
				524	c->need_recovery);
				525	ubifs_assert(c->gc_lnum != lnum);
				526	ubifs_assert(wbuf->lnum != lnum);
				527
				528	/*
				529	* We scan the entire LEB even though we only really need to scan up to
				530	* (c->leb_size - lp->free).
				531	*/
Artem Bityutskiy	348709b	2009-08-25 15:00:55 +0300	[diff] [blame]	532	sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	533	if (IS_ERR(sleb))
				534	return PTR_ERR(sleb);
				535
				536	ubifs_assert(!list_empty(&sleb->nodes));
				537	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
				538
				539	if (snod->type == UBIFS_IDX_NODE) {
				540	struct ubifs_gced_idx_leb *idx_gc;
				541
				542	dbg_gc("indexing LEB %d (free %d, dirty %d)",
				543	lnum, lp->free, lp->dirty);
				544	list_for_each_entry(snod, &sleb->nodes, list) {
				545	struct ubifs_idx_node *idx = snod->node;
				546	int level = le16_to_cpu(idx->level);
				547
				548	ubifs_assert(snod->type == UBIFS_IDX_NODE);
				549	key_read(c, ubifs_idx_key(c, idx), &snod->key);
				550	err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
				551	snod->offs);
				552	if (err)
				553	goto out;
				554	}
				555
				556	idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
				557	if (!idx_gc) {
				558	err = -ENOMEM;
				559	goto out;
				560	}
				561
				562	idx_gc->lnum = lnum;
				563	idx_gc->unmap = 0;
				564	list_add(&idx_gc->list, &c->idx_gc);
				565
				566	/*
				567	* Don't release the LEB until after the next commit, because
Adrian Hunter	227c75c	2009-01-29 11:53:51 +0200	[diff] [blame]	568	* it may contain data which is needed for recovery. So
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	569	* although we freed this LEB, it will become usable only after
				570	* the commit.
				571	*/
				572	err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
				573	LPROPS_INDEX, 1);
				574	if (err)
				575	goto out;
				576	err = LEB_FREED_IDX;
				577	} else {
				578	dbg_gc("data LEB %d (free %d, dirty %d)",
				579	lnum, lp->free, lp->dirty);
				580
				581	err = move_nodes(c, sleb);
				582	if (err)
Adrian Hunter	6dcfac4	2008-09-12 12:27:47 +0300	[diff] [blame]	583	goto out_inc_seq;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	584
				585	err = gc_sync_wbufs(c);
				586	if (err)
Adrian Hunter	6dcfac4	2008-09-12 12:27:47 +0300	[diff] [blame]	587	goto out_inc_seq;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	588
				589	err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
				590	if (err)
Adrian Hunter	6dcfac4	2008-09-12 12:27:47 +0300	[diff] [blame]	591	goto out_inc_seq;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	592
Adrian Hunter	601c0bc	2008-08-22 14:23:35 +0300	[diff] [blame]	593	/* Allow for races with TNC */
				594	c->gced_lnum = lnum;
				595	smp_wmb();
				596	c->gc_seq += 1;
				597	smp_wmb();
				598
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	599	if (c->gc_lnum == -1) {
				600	c->gc_lnum = lnum;
				601	err = LEB_RETAINED;
				602	} else {
				603	err = ubifs_wbuf_sync_nolock(wbuf);
				604	if (err)
				605	goto out;
				606
				607	err = ubifs_leb_unmap(c, lnum);
				608	if (err)
				609	goto out;
				610
				611	err = LEB_FREED;
				612	}
				613	}
				614
				615	out:
				616	ubifs_scan_destroy(sleb);
				617	return err;
Adrian Hunter	6dcfac4	2008-09-12 12:27:47 +0300	[diff] [blame]	618
				619	out_inc_seq:
				620	/* We may have moved at least some nodes so allow for races with TNC */
				621	c->gced_lnum = lnum;
				622	smp_wmb();
				623	c->gc_seq += 1;
				624	smp_wmb();
				625	goto out;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	626	}
				627
				628	/**
				629	* ubifs_garbage_collect - UBIFS garbage collector.
				630	* @c: UBIFS file-system description object
				631	* @anyway: do GC even if there are free LEBs
				632	*
				633	* This function does out-of-place garbage collection. The return codes are:
				634	* o positive LEB number if the LEB has been freed and may be used;
				635	* o %-EAGAIN if the caller has to run commit;
				636	* o %-ENOSPC if GC failed to make any progress;
				637	* o other negative error codes in case of other errors.
				638	*
				639	* Garbage collector writes data to the journal when GC'ing data LEBs, and just
				640	* marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
				641	* commit may be required. But commit cannot be run from inside GC, because the
				642	* caller might be holding the commit lock, so %-EAGAIN is returned instead;
				643	* And this error code means that the caller has to run commit, and re-run GC
				644	* if there is still no free space.
				645	*
				646	* There are many reasons why this function may return %-EAGAIN:
				647	* o the log is full and there is no space to write an LEB reference for
				648	* @c->gc_lnum;
				649	* o the journal is too large and exceeds size limitations;
				650	* o GC moved indexing LEBs, but they can be used only after the commit;
				651	* o the shrinker fails to find clean znodes to free and requests the commit;
				652	* o etc.
				653	*
				654	* Note, if the file-system is close to be full, this function may return
				655	* %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
				656	* the function. E.g., this happens if the limits on the journal size are too
				657	* tough and GC writes too much to the journal before an LEB is freed. This
				658	* might also mean that the journal is too large, and the TNC becomes to big,
				659	* so that the shrinker is constantly called, finds not clean znodes to free,
				660	* and requests commit. Well, this may also happen if the journal is all right,
				661	* but another kernel process consumes too much memory. Anyway, infinite
				662	* %-EAGAIN may happen, but in some extreme/misconfiguration cases.
				663	*/
				664	int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
				665	{
				666	int i, err, ret, min_space = c->dead_wm;
				667	struct ubifs_lprops lp;
				668	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
				669
				670	ubifs_assert_cmt_locked(c);
				671
				672	if (ubifs_gc_should_commit(c))
				673	return -EAGAIN;
				674
				675	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
				676
				677	if (c->ro_media) {
				678	ret = -EROFS;
				679	goto out_unlock;
				680	}
				681
				682	/* We expect the write-buffer to be empty on entry */
				683	ubifs_assert(!wbuf->used);
				684
				685	for (i = 0; ; i++) {
				686	int space_before = c->leb_size - wbuf->offs - wbuf->used;
				687	int space_after;
				688
				689	cond_resched();
				690
				691	/* Give the commit an opportunity to run */
				692	if (ubifs_gc_should_commit(c)) {
				693	ret = -EAGAIN;
				694	break;
				695	}
				696
				697	if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
				698	/*
				699	* We've done enough iterations. Indexing LEBs were
				700	* moved and will be available after the commit.
				701	*/
				702	dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
				703	ubifs_commit_required(c);
				704	ret = -EAGAIN;
				705	break;
				706	}
				707
				708	if (i > HARD_LEBS_LIMIT) {
				709	/*
				710	* We've moved too many LEBs and have not made
				711	* progress, give up.
				712	*/
				713	dbg_gc("hard limit, -ENOSPC");
				714	ret = -ENOSPC;
				715	break;
				716	}
				717
				718	/*
				719	* Empty and freeable LEBs can turn up while we waited for
				720	* the wbuf lock, or while we have been running GC. In that
				721	* case, we should just return one of those instead of
				722	* continuing to GC dirty LEBs. Hence we request
				723	* 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
				724	*/
				725	ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
				726	if (ret) {
				727	if (ret == -ENOSPC)
				728	dbg_gc("no more dirty LEBs");
				729	break;
				730	}
				731
				732	dbg_gc("found LEB %d: free %d, dirty %d, sum %d "
				733	"(min. space %d)", lp.lnum, lp.free, lp.dirty,
				734	lp.free + lp.dirty, min_space);
				735
				736	if (lp.free + lp.dirty == c->leb_size) {
				737	/* An empty LEB was returned */
				738	dbg_gc("LEB %d is free, return it", lp.lnum);
				739	/*
				740	* ubifs_find_dirty_leb() doesn't return freeable index
				741	* LEBs.
				742	*/
				743	ubifs_assert(!(lp.flags & LPROPS_INDEX));
				744	if (lp.free != c->leb_size) {
				745	/*
				746	* Write buffers must be sync'd before
				747	* unmapping freeable LEBs, because one of them
				748	* may contain data which obsoletes something
				749	* in 'lp.pnum'.
				750	*/
				751	ret = gc_sync_wbufs(c);
				752	if (ret)
				753	goto out;
				754	ret = ubifs_change_one_lp(c, lp.lnum,
				755	c->leb_size, 0, 0, 0,
				756	0);
				757	if (ret)
				758	goto out;
				759	}
				760	ret = ubifs_leb_unmap(c, lp.lnum);
				761	if (ret)
				762	goto out;
				763	ret = lp.lnum;
				764	break;
				765	}
				766
				767	space_before = c->leb_size - wbuf->offs - wbuf->used;
				768	if (wbuf->lnum == -1)
				769	space_before = 0;
				770
				771	ret = ubifs_garbage_collect_leb(c, &lp);
				772	if (ret < 0) {
				773	if (ret == -EAGAIN \|\| ret == -ENOSPC) {
				774	/*
				775	* These codes are not errors, so we have to
				776	* return the LEB to lprops. But if the
				777	* 'ubifs_return_leb()' function fails, its
				778	* failure code is propagated to the caller
				779	* instead of the original '-EAGAIN' or
				780	* '-ENOSPC'.
				781	*/
				782	err = ubifs_return_leb(c, lp.lnum);
				783	if (err)
				784	ret = err;
				785	break;
				786	}
				787	goto out;
				788	}
				789
				790	if (ret == LEB_FREED) {
				791	/* An LEB has been freed and is ready for use */
				792	dbg_gc("LEB %d freed, return", lp.lnum);
				793	ret = lp.lnum;
				794	break;
				795	}
				796
				797	if (ret == LEB_FREED_IDX) {
				798	/*
				799	* This was an indexing LEB and it cannot be
				800	* immediately used. And instead of requesting the
				801	* commit straight away, we try to garbage collect some
				802	* more.
				803	*/
				804	dbg_gc("indexing LEB %d freed, continue", lp.lnum);
				805	continue;
				806	}
				807
				808	ubifs_assert(ret == LEB_RETAINED);
				809	space_after = c->leb_size - wbuf->offs - wbuf->used;
				810	dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
				811	space_after - space_before);
				812
				813	if (space_after > space_before) {
				814	/* GC makes progress, keep working */
				815	min_space >>= 1;
				816	if (min_space < c->dead_wm)
				817	min_space = c->dead_wm;
				818	continue;
				819	}
				820
				821	dbg_gc("did not make progress");
				822
				823	/*
				824	* GC moved an LEB bud have not done any progress. This means
				825	* that the previous GC head LEB contained too few free space
				826	* and the LEB which was GC'ed contained only large nodes which
				827	* did not fit that space.
				828	*
				829	* We can do 2 things:
				830	* 1. pick another LEB in a hope it'll contain a small node
				831	* which will fit the space we have at the end of current GC
				832	* head LEB, but there is no guarantee, so we try this out
				833	* unless we have already been working for too long;
				834	* 2. request an LEB with more dirty space, which will force
				835	* 'ubifs_find_dirty_leb()' to start scanning the lprops
				836	* table, instead of just picking one from the heap
				837	* (previously it already picked the dirtiest LEB).
				838	*/
				839	if (i < SOFT_LEBS_LIMIT) {
				840	dbg_gc("try again");
				841	continue;
				842	}
				843
				844	min_space <<= 1;
				845	if (min_space > c->dark_wm)
				846	min_space = c->dark_wm;
				847	dbg_gc("set min. space to %d", min_space);
				848	}
				849
				850	if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
				851	dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
				852	ubifs_commit_required(c);
				853	ret = -EAGAIN;
				854	}
				855
				856	err = ubifs_wbuf_sync_nolock(wbuf);
				857	if (!err)
				858	err = ubifs_leb_unmap(c, c->gc_lnum);
				859	if (err) {
				860	ret = err;
				861	goto out;
				862	}
				863	out_unlock:
				864	mutex_unlock(&wbuf->io_mutex);
				865	return ret;
				866
				867	out:
				868	ubifs_assert(ret < 0);
				869	ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
				870	ubifs_ro_mode(c, ret);
				871	ubifs_wbuf_sync_nolock(wbuf);
				872	mutex_unlock(&wbuf->io_mutex);
				873	ubifs_return_leb(c, lp.lnum);
				874	return ret;
				875	}
				876
				877	/**
				878	* ubifs_gc_start_commit - garbage collection at start of commit.
				879	* @c: UBIFS file-system description object
				880	*
				881	* If a LEB has only dirty and free space, then we may safely unmap it and make
				882	* it free. Note, we cannot do this with indexing LEBs because dirty space may
				883	* correspond index nodes that are required for recovery. In that case, the
				884	* LEB cannot be unmapped until after the next commit.
				885	*
				886	* This function returns %0 upon success and a negative error code upon failure.
				887	*/
				888	int ubifs_gc_start_commit(struct ubifs_info *c)
				889	{
				890	struct ubifs_gced_idx_leb *idx_gc;
				891	const struct ubifs_lprops *lp;
				892	int err = 0, flags;
				893
				894	ubifs_get_lprops(c);
				895
				896	/*
				897	* Unmap (non-index) freeable LEBs. Note that recovery requires that all
				898	* wbufs are sync'd before this, which is done in 'do_commit()'.
				899	*/
				900	while (1) {
				901	lp = ubifs_fast_find_freeable(c);
Hirofumi Nakagawa	8d47aef	2008-08-21 17:16:40 +0300	[diff] [blame]	902	if (IS_ERR(lp)) {
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	903	err = PTR_ERR(lp);
				904	goto out;
				905	}
				906	if (!lp)
				907	break;
				908	ubifs_assert(!(lp->flags & LPROPS_TAKEN));
				909	ubifs_assert(!(lp->flags & LPROPS_INDEX));
				910	err = ubifs_leb_unmap(c, lp->lnum);
				911	if (err)
				912	goto out;
				913	lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
Hirofumi Nakagawa	8d47aef	2008-08-21 17:16:40 +0300	[diff] [blame]	914	if (IS_ERR(lp)) {
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	915	err = PTR_ERR(lp);
				916	goto out;
				917	}
				918	ubifs_assert(!(lp->flags & LPROPS_TAKEN));
				919	ubifs_assert(!(lp->flags & LPROPS_INDEX));
				920	}
				921
				922	/* Mark GC'd index LEBs OK to unmap after this commit finishes */
				923	list_for_each_entry(idx_gc, &c->idx_gc, list)
				924	idx_gc->unmap = 1;
				925
				926	/* Record index freeable LEBs for unmapping after commit */
				927	while (1) {
				928	lp = ubifs_fast_find_frdi_idx(c);
Hirofumi Nakagawa	8d47aef	2008-08-21 17:16:40 +0300	[diff] [blame]	929	if (IS_ERR(lp)) {
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	930	err = PTR_ERR(lp);
				931	goto out;
				932	}
				933	if (!lp)
				934	break;
				935	idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
				936	if (!idx_gc) {
				937	err = -ENOMEM;
				938	goto out;
				939	}
				940	ubifs_assert(!(lp->flags & LPROPS_TAKEN));
				941	ubifs_assert(lp->flags & LPROPS_INDEX);
				942	/* Don't release the LEB until after the next commit */
				943	flags = (lp->flags \| LPROPS_TAKEN) ^ LPROPS_INDEX;
				944	lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
Hirofumi Nakagawa	8d47aef	2008-08-21 17:16:40 +0300	[diff] [blame]	945	if (IS_ERR(lp)) {
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	946	err = PTR_ERR(lp);
				947	kfree(idx_gc);
				948	goto out;
				949	}
				950	ubifs_assert(lp->flags & LPROPS_TAKEN);
				951	ubifs_assert(!(lp->flags & LPROPS_INDEX));
				952	idx_gc->lnum = lp->lnum;
				953	idx_gc->unmap = 1;
				954	list_add(&idx_gc->list, &c->idx_gc);
				955	}
				956	out:
				957	ubifs_release_lprops(c);
				958	return err;
				959	}
				960
				961	/**
				962	* ubifs_gc_end_commit - garbage collection at end of commit.
				963	* @c: UBIFS file-system description object
				964	*
				965	* This function completes out-of-place garbage collection of index LEBs.
				966	*/
				967	int ubifs_gc_end_commit(struct ubifs_info *c)
				968	{
				969	struct ubifs_gced_idx_leb idx_gc, tmp;
				970	struct ubifs_wbuf *wbuf;
				971	int err = 0;
				972
				973	wbuf = &c->jheads[GCHD].wbuf;
				974	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
				975	list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
				976	if (idx_gc->unmap) {
				977	dbg_gc("LEB %d", idx_gc->lnum);
				978	err = ubifs_leb_unmap(c, idx_gc->lnum);
				979	if (err)
				980	goto out;
				981	err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
				982	LPROPS_NC, 0, LPROPS_TAKEN, -1);
				983	if (err)
				984	goto out;
				985	list_del(&idx_gc->list);
				986	kfree(idx_gc);
				987	}
				988	out:
				989	mutex_unlock(&wbuf->io_mutex);
				990	return err;
				991	}
				992
				993	/**
				994	* ubifs_destroy_idx_gc - destroy idx_gc list.
				995	* @c: UBIFS file-system description object
				996	*
Adrian Hunter	b466f17	2009-01-29 12:59:33 +0200	[diff] [blame]	997	* This function destroys the @c->idx_gc list. It is called when unmounting
				998	* so locks are not needed. Returns zero in case of success and a negative
				999	* error code in case of failure.
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	1000	*/
Adrian Hunter	b466f17	2009-01-29 12:59:33 +0200	[diff] [blame]	1001	void ubifs_destroy_idx_gc(struct ubifs_info *c)
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	1002	{
				1003	while (!list_empty(&c->idx_gc)) {
				1004	struct ubifs_gced_idx_leb *idx_gc;
				1005
				1006	idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
				1007	list);
Adrian Hunter	b466f17	2009-01-29 12:59:33 +0200	[diff] [blame]	1008	c->idx_gc_cnt -= 1;
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	1009	list_del(&idx_gc->list);
				1010	kfree(idx_gc);
				1011	}
Artem Bityutskiy	1e51764	2008-07-14 19:08:37 +0300	[diff] [blame]	1012	}
				1013
				1014	/**
				1015	* ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
				1016	* @c: UBIFS file-system description object
				1017	*
				1018	* Called during start commit so locks are not needed.
				1019	*/
				1020	int ubifs_get_idx_gc_leb(struct ubifs_info *c)
				1021	{
				1022	struct ubifs_gced_idx_leb *idx_gc;
				1023	int lnum;
				1024
				1025	if (list_empty(&c->idx_gc))
				1026	return -ENOSPC;
				1027	idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
				1028	lnum = idx_gc->lnum;
				1029	/* c->idx_gc_cnt is updated by the caller when lprops are updated */
				1030	list_del(&idx_gc->list);
				1031	kfree(idx_gc);
				1032	return lnum;
				1033	}