blob: 9e1d05666fed5d1ad03589995f8ecb3f78ec7b82 [file] [log] [blame]
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001/*
2 * This file is part of UBIFS.
3 *
4 * Copyright (C) 2006-2008 Nokia Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published by
8 * the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc., 51
17 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * Authors: Artem Bityutskiy (Битюцкий Артём)
20 * Adrian Hunter
21 */
22
23/*
24 * This file implements UBIFS shrinker which evicts clean znodes from the TNC
25 * tree when Linux VM needs more RAM.
26 *
27 * We do not implement any LRU lists to find oldest znodes to free because it
28 * would add additional overhead to the file system fast paths. So the shrinker
29 * just walks the TNC tree when searching for znodes to free.
30 *
31 * If the root of a TNC sub-tree is clean and old enough, then the children are
32 * also clean and old enough. So the shrinker walks the TNC in level order and
33 * dumps entire sub-trees.
34 *
35 * The age of znodes is just the time-stamp when they were last looked at.
36 * The current shrinker first tries to evict old znodes, then young ones.
37 *
38 * Since the shrinker is global, it has to protect against races with FS
39 * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'.
40 */
41
42#include "ubifs.h"
43
44/* List of all UBIFS file-system instances */
45LIST_HEAD(ubifs_infos);
46
47/*
48 * We number each shrinker run and record the number on the ubifs_info structure
49 * so that we can easily work out which ubifs_info structures have already been
50 * done by the current run.
51 */
52static unsigned int shrinker_run_no;
53
54/* Protects 'ubifs_infos' list */
55DEFINE_SPINLOCK(ubifs_infos_lock);
56
57/* Global clean znode counter (for all mounted UBIFS instances) */
58atomic_long_t ubifs_clean_zn_cnt;
59
60/**
61 * shrink_tnc - shrink TNC tree.
62 * @c: UBIFS file-system description object
63 * @nr: number of znodes to free
64 * @age: the age of znodes to free
65 * @contention: if any contention, this is set to %1
66 *
67 * This function traverses TNC tree and frees clean znodes. It does not free
68 * clean znodes which younger then @age. Returns number of freed znodes.
69 */
70static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
71{
72 int total_freed = 0;
73 struct ubifs_znode *znode, *zprev;
74 int time = get_seconds();
75
76 ubifs_assert(mutex_is_locked(&c->umount_mutex));
77 ubifs_assert(mutex_is_locked(&c->tnc_mutex));
78
79 if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0)
80 return 0;
81
82 /*
83 * Traverse the TNC tree in levelorder manner, so that it is possible
84 * to destroy large sub-trees. Indeed, if a znode is old, then all its
85 * children are older or of the same age.
86 *
87 * Note, we are holding 'c->tnc_mutex', so we do not have to lock the
88 * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is
89 * changed only when the 'c->tnc_mutex' is held.
90 */
91 zprev = NULL;
92 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
93 while (znode && total_freed < nr &&
94 atomic_long_read(&c->clean_zn_cnt) > 0) {
95 int freed;
96
97 /*
98 * If the znode is clean, but it is in the 'c->cnext' list, this
99 * means that this znode has just been written to flash as a
100 * part of commit and was marked clean. They will be removed
101 * from the list at end commit. We cannot change the list,
102 * because it is not protected by any mutex (design decision to
103 * make commit really independent and parallel to main I/O). So
104 * we just skip these znodes.
105 *
106 * Note, the 'clean_zn_cnt' counters are not updated until
107 * after the commit, so the UBIFS shrinker does not report
108 * the znodes which are in the 'c->cnext' list as freeable.
109 *
110 * Also note, if the root of a sub-tree is not in 'c->cnext',
111 * then the whole sub-tree is not in 'c->cnext' as well, so it
112 * is safe to dump whole sub-tree.
113 */
114
115 if (znode->cnext) {
116 /*
117 * Very soon these znodes will be removed from the list
118 * and become freeable.
119 */
120 *contention = 1;
121 } else if (!ubifs_zn_dirty(znode) &&
122 abs(time - znode->time) >= age) {
123 if (znode->parent)
124 znode->parent->zbranch[znode->iip].znode = NULL;
125 else
126 c->zroot.znode = NULL;
127
128 freed = ubifs_destroy_tnc_subtree(znode);
129 atomic_long_sub(freed, &ubifs_clean_zn_cnt);
130 atomic_long_sub(freed, &c->clean_zn_cnt);
131 ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
132 total_freed += freed;
133 znode = zprev;
134 }
135
136 if (unlikely(!c->zroot.znode))
137 break;
138
139 zprev = znode;
140 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
141 cond_resched();
142 }
143
144 return total_freed;
145}
146
147/**
148 * shrink_tnc_trees - shrink UBIFS TNC trees.
149 * @nr: number of znodes to free
150 * @age: the age of znodes to free
151 * @contention: if any contention, this is set to %1
152 *
153 * This function walks the list of mounted UBIFS file-systems and frees clean
Frederik Schwarzer025dfda2008-10-16 19:02:37 +0200154 * znodes which are older than @age, until at least @nr znodes are freed.
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300155 * Returns the number of freed znodes.
156 */
157static int shrink_tnc_trees(int nr, int age, int *contention)
158{
159 struct ubifs_info *c;
160 struct list_head *p;
161 unsigned int run_no;
162 int freed = 0;
163
164 spin_lock(&ubifs_infos_lock);
165 do {
166 run_no = ++shrinker_run_no;
167 } while (run_no == 0);
168 /* Iterate over all mounted UBIFS file-systems and try to shrink them */
169 p = ubifs_infos.next;
170 while (p != &ubifs_infos) {
171 c = list_entry(p, struct ubifs_info, infos_list);
172 /*
173 * We move the ones we do to the end of the list, so we stop
174 * when we see one we have already done.
175 */
176 if (c->shrinker_run_no == run_no)
177 break;
178 if (!mutex_trylock(&c->umount_mutex)) {
179 /* Some un-mount is in progress, try next FS */
180 *contention = 1;
181 p = p->next;
182 continue;
183 }
184 /*
185 * We're holding 'c->umount_mutex', so the file-system won't go
186 * away.
187 */
188 if (!mutex_trylock(&c->tnc_mutex)) {
189 mutex_unlock(&c->umount_mutex);
190 *contention = 1;
191 p = p->next;
192 continue;
193 }
194 spin_unlock(&ubifs_infos_lock);
195 /*
196 * OK, now we have TNC locked, the file-system cannot go away -
197 * it is safe to reap the cache.
198 */
199 c->shrinker_run_no = run_no;
200 freed += shrink_tnc(c, nr, age, contention);
201 mutex_unlock(&c->tnc_mutex);
202 spin_lock(&ubifs_infos_lock);
203 /* Get the next list element before we move this one */
204 p = p->next;
205 /*
206 * Move this one to the end of the list to provide some
207 * fairness.
208 */
Eric Sesterhennec328162009-02-13 09:13:11 +0100209 list_move_tail(&c->infos_list, &ubifs_infos);
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300210 mutex_unlock(&c->umount_mutex);
211 if (freed >= nr)
212 break;
213 }
214 spin_unlock(&ubifs_infos_lock);
215 return freed;
216}
217
218/**
219 * kick_a_thread - kick a background thread to start commit.
220 *
221 * This function kicks a background thread to start background commit. Returns
222 * %-1 if a thread was kicked or there is another reason to assume the memory
223 * will soon be freed or become freeable. If there are no dirty znodes, returns
224 * %0.
225 */
226static int kick_a_thread(void)
227{
228 int i;
229 struct ubifs_info *c;
230
231 /*
232 * Iterate over all mounted UBIFS file-systems and find out if there is
233 * already an ongoing commit operation there. If no, then iterate for
234 * the second time and initiate background commit.
235 */
236 spin_lock(&ubifs_infos_lock);
237 for (i = 0; i < 2; i++) {
238 list_for_each_entry(c, &ubifs_infos, infos_list) {
239 long dirty_zn_cnt;
240
241 if (!mutex_trylock(&c->umount_mutex)) {
242 /*
243 * Some un-mount is in progress, it will
244 * certainly free memory, so just return.
245 */
246 spin_unlock(&ubifs_infos_lock);
247 return -1;
248 }
249
250 dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
251
252 if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
Artem Bityutskiy2ef13292010-09-19 18:34:26 +0300253 c->ro_mount || c->ro_error) {
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300254 mutex_unlock(&c->umount_mutex);
255 continue;
256 }
257
258 if (c->cmt_state != COMMIT_RESTING) {
259 spin_unlock(&ubifs_infos_lock);
260 mutex_unlock(&c->umount_mutex);
261 return -1;
262 }
263
264 if (i == 1) {
Eric Sesterhennec328162009-02-13 09:13:11 +0100265 list_move_tail(&c->infos_list, &ubifs_infos);
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300266 spin_unlock(&ubifs_infos_lock);
267
268 ubifs_request_bg_commit(c);
269 mutex_unlock(&c->umount_mutex);
270 return -1;
271 }
272 mutex_unlock(&c->umount_mutex);
273 }
274 }
275 spin_unlock(&ubifs_infos_lock);
276
277 return 0;
278}
279
Al Viroef1d5752011-05-29 13:46:08 +0100280int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300281{
Al Viroef1d5752011-05-29 13:46:08 +0100282 int nr = sc->nr_to_scan;
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300283 int freed, contention = 0;
284 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
285
286 if (nr == 0)
Artem Bityutskiycf610bf2011-05-31 07:03:21 +0300287 /*
288 * Due to the way UBIFS updates the clean znode counter it may
289 * temporarily be negative.
290 */
291 return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300292
293 if (!clean_zn_cnt) {
294 /*
295 * No clean znodes, nothing to reap. All we can do in this case
296 * is to kick background threads to start commit, which will
297 * probably make clean znodes which, in turn, will be freeable.
298 * And we return -1 which means will make VM call us again
299 * later.
300 */
301 dbg_tnc("no clean znodes, kick a thread");
302 return kick_a_thread();
303 }
304
305 freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention);
306 if (freed >= nr)
307 goto out;
308
309 dbg_tnc("not enough old znodes, try to free young ones");
310 freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention);
311 if (freed >= nr)
312 goto out;
313
314 dbg_tnc("not enough young znodes, free all");
315 freed += shrink_tnc_trees(nr - freed, 0, &contention);
316
317 if (!freed && contention) {
318 dbg_tnc("freed nothing, but contention");
319 return -1;
320 }
321
322out:
323 dbg_tnc("%d znodes were freed, requested %d", freed, nr);
324 return freed;
325}