Artem Bityutskiy | 1e51764 | 2008-07-14 19:08:37 +0300 | [diff] [blame] | 1 | /* |
| 2 | * This file is part of UBIFS. |
| 3 | * |
| 4 | * Copyright (C) 2006-2008 Nokia Corporation. |
| 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify it |
| 7 | * under the terms of the GNU General Public License version 2 as published by |
| 8 | * the Free Software Foundation. |
| 9 | * |
| 10 | * This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
| 13 | * more details. |
| 14 | * |
| 15 | * You should have received a copy of the GNU General Public License along with |
| 16 | * this program; if not, write to the Free Software Foundation, Inc., 51 |
| 17 | * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
| 18 | * |
| 19 | * Authors: Artem Bityutskiy (Битюцкий Артём) |
| 20 | * Adrian Hunter |
| 21 | */ |
| 22 | |
| 23 | /* |
| 24 | * This file implements UBIFS shrinker which evicts clean znodes from the TNC |
| 25 | * tree when Linux VM needs more RAM. |
| 26 | * |
| 27 | * We do not implement any LRU lists to find oldest znodes to free because it |
| 28 | * would add additional overhead to the file system fast paths. So the shrinker |
| 29 | * just walks the TNC tree when searching for znodes to free. |
| 30 | * |
| 31 | * If the root of a TNC sub-tree is clean and old enough, then the children are |
| 32 | * also clean and old enough. So the shrinker walks the TNC in level order and |
| 33 | * dumps entire sub-trees. |
| 34 | * |
| 35 | * The age of znodes is just the time-stamp when they were last looked at. |
| 36 | * The current shrinker first tries to evict old znodes, then young ones. |
| 37 | * |
| 38 | * Since the shrinker is global, it has to protect against races with FS |
| 39 | * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. |
| 40 | */ |
| 41 | |
| 42 | #include "ubifs.h" |
| 43 | |
| 44 | /* List of all UBIFS file-system instances */ |
| 45 | LIST_HEAD(ubifs_infos); |
| 46 | |
| 47 | /* |
| 48 | * We number each shrinker run and record the number on the ubifs_info structure |
| 49 | * so that we can easily work out which ubifs_info structures have already been |
| 50 | * done by the current run. |
| 51 | */ |
| 52 | static unsigned int shrinker_run_no; |
| 53 | |
| 54 | /* Protects 'ubifs_infos' list */ |
| 55 | DEFINE_SPINLOCK(ubifs_infos_lock); |
| 56 | |
| 57 | /* Global clean znode counter (for all mounted UBIFS instances) */ |
| 58 | atomic_long_t ubifs_clean_zn_cnt; |
| 59 | |
| 60 | /** |
| 61 | * shrink_tnc - shrink TNC tree. |
| 62 | * @c: UBIFS file-system description object |
| 63 | * @nr: number of znodes to free |
| 64 | * @age: the age of znodes to free |
| 65 | * @contention: if any contention, this is set to %1 |
| 66 | * |
| 67 | * This function traverses TNC tree and frees clean znodes. It does not free |
| 68 | * clean znodes which younger then @age. Returns number of freed znodes. |
| 69 | */ |
| 70 | static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) |
| 71 | { |
| 72 | int total_freed = 0; |
| 73 | struct ubifs_znode *znode, *zprev; |
| 74 | int time = get_seconds(); |
| 75 | |
| 76 | ubifs_assert(mutex_is_locked(&c->umount_mutex)); |
| 77 | ubifs_assert(mutex_is_locked(&c->tnc_mutex)); |
| 78 | |
| 79 | if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) |
| 80 | return 0; |
| 81 | |
| 82 | /* |
| 83 | * Traverse the TNC tree in levelorder manner, so that it is possible |
| 84 | * to destroy large sub-trees. Indeed, if a znode is old, then all its |
| 85 | * children are older or of the same age. |
| 86 | * |
| 87 | * Note, we are holding 'c->tnc_mutex', so we do not have to lock the |
| 88 | * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is |
| 89 | * changed only when the 'c->tnc_mutex' is held. |
| 90 | */ |
| 91 | zprev = NULL; |
| 92 | znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); |
| 93 | while (znode && total_freed < nr && |
| 94 | atomic_long_read(&c->clean_zn_cnt) > 0) { |
| 95 | int freed; |
| 96 | |
| 97 | /* |
| 98 | * If the znode is clean, but it is in the 'c->cnext' list, this |
| 99 | * means that this znode has just been written to flash as a |
| 100 | * part of commit and was marked clean. They will be removed |
| 101 | * from the list at end commit. We cannot change the list, |
| 102 | * because it is not protected by any mutex (design decision to |
| 103 | * make commit really independent and parallel to main I/O). So |
| 104 | * we just skip these znodes. |
| 105 | * |
| 106 | * Note, the 'clean_zn_cnt' counters are not updated until |
| 107 | * after the commit, so the UBIFS shrinker does not report |
| 108 | * the znodes which are in the 'c->cnext' list as freeable. |
| 109 | * |
| 110 | * Also note, if the root of a sub-tree is not in 'c->cnext', |
| 111 | * then the whole sub-tree is not in 'c->cnext' as well, so it |
| 112 | * is safe to dump whole sub-tree. |
| 113 | */ |
| 114 | |
| 115 | if (znode->cnext) { |
| 116 | /* |
| 117 | * Very soon these znodes will be removed from the list |
| 118 | * and become freeable. |
| 119 | */ |
| 120 | *contention = 1; |
| 121 | } else if (!ubifs_zn_dirty(znode) && |
| 122 | abs(time - znode->time) >= age) { |
| 123 | if (znode->parent) |
| 124 | znode->parent->zbranch[znode->iip].znode = NULL; |
| 125 | else |
| 126 | c->zroot.znode = NULL; |
| 127 | |
| 128 | freed = ubifs_destroy_tnc_subtree(znode); |
| 129 | atomic_long_sub(freed, &ubifs_clean_zn_cnt); |
| 130 | atomic_long_sub(freed, &c->clean_zn_cnt); |
| 131 | ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); |
| 132 | total_freed += freed; |
| 133 | znode = zprev; |
| 134 | } |
| 135 | |
| 136 | if (unlikely(!c->zroot.znode)) |
| 137 | break; |
| 138 | |
| 139 | zprev = znode; |
| 140 | znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); |
| 141 | cond_resched(); |
| 142 | } |
| 143 | |
| 144 | return total_freed; |
| 145 | } |
| 146 | |
| 147 | /** |
| 148 | * shrink_tnc_trees - shrink UBIFS TNC trees. |
| 149 | * @nr: number of znodes to free |
| 150 | * @age: the age of znodes to free |
| 151 | * @contention: if any contention, this is set to %1 |
| 152 | * |
| 153 | * This function walks the list of mounted UBIFS file-systems and frees clean |
Frederik Schwarzer | 025dfda | 2008-10-16 19:02:37 +0200 | [diff] [blame] | 154 | * znodes which are older than @age, until at least @nr znodes are freed. |
Artem Bityutskiy | 1e51764 | 2008-07-14 19:08:37 +0300 | [diff] [blame] | 155 | * Returns the number of freed znodes. |
| 156 | */ |
| 157 | static int shrink_tnc_trees(int nr, int age, int *contention) |
| 158 | { |
| 159 | struct ubifs_info *c; |
| 160 | struct list_head *p; |
| 161 | unsigned int run_no; |
| 162 | int freed = 0; |
| 163 | |
| 164 | spin_lock(&ubifs_infos_lock); |
| 165 | do { |
| 166 | run_no = ++shrinker_run_no; |
| 167 | } while (run_no == 0); |
| 168 | /* Iterate over all mounted UBIFS file-systems and try to shrink them */ |
| 169 | p = ubifs_infos.next; |
| 170 | while (p != &ubifs_infos) { |
| 171 | c = list_entry(p, struct ubifs_info, infos_list); |
| 172 | /* |
| 173 | * We move the ones we do to the end of the list, so we stop |
| 174 | * when we see one we have already done. |
| 175 | */ |
| 176 | if (c->shrinker_run_no == run_no) |
| 177 | break; |
| 178 | if (!mutex_trylock(&c->umount_mutex)) { |
| 179 | /* Some un-mount is in progress, try next FS */ |
| 180 | *contention = 1; |
| 181 | p = p->next; |
| 182 | continue; |
| 183 | } |
| 184 | /* |
| 185 | * We're holding 'c->umount_mutex', so the file-system won't go |
| 186 | * away. |
| 187 | */ |
| 188 | if (!mutex_trylock(&c->tnc_mutex)) { |
| 189 | mutex_unlock(&c->umount_mutex); |
| 190 | *contention = 1; |
| 191 | p = p->next; |
| 192 | continue; |
| 193 | } |
| 194 | spin_unlock(&ubifs_infos_lock); |
| 195 | /* |
| 196 | * OK, now we have TNC locked, the file-system cannot go away - |
| 197 | * it is safe to reap the cache. |
| 198 | */ |
| 199 | c->shrinker_run_no = run_no; |
| 200 | freed += shrink_tnc(c, nr, age, contention); |
| 201 | mutex_unlock(&c->tnc_mutex); |
| 202 | spin_lock(&ubifs_infos_lock); |
| 203 | /* Get the next list element before we move this one */ |
| 204 | p = p->next; |
| 205 | /* |
| 206 | * Move this one to the end of the list to provide some |
| 207 | * fairness. |
| 208 | */ |
| 209 | list_del(&c->infos_list); |
| 210 | list_add_tail(&c->infos_list, &ubifs_infos); |
| 211 | mutex_unlock(&c->umount_mutex); |
| 212 | if (freed >= nr) |
| 213 | break; |
| 214 | } |
| 215 | spin_unlock(&ubifs_infos_lock); |
| 216 | return freed; |
| 217 | } |
| 218 | |
| 219 | /** |
| 220 | * kick_a_thread - kick a background thread to start commit. |
| 221 | * |
| 222 | * This function kicks a background thread to start background commit. Returns |
| 223 | * %-1 if a thread was kicked or there is another reason to assume the memory |
| 224 | * will soon be freed or become freeable. If there are no dirty znodes, returns |
| 225 | * %0. |
| 226 | */ |
| 227 | static int kick_a_thread(void) |
| 228 | { |
| 229 | int i; |
| 230 | struct ubifs_info *c; |
| 231 | |
| 232 | /* |
| 233 | * Iterate over all mounted UBIFS file-systems and find out if there is |
| 234 | * already an ongoing commit operation there. If no, then iterate for |
| 235 | * the second time and initiate background commit. |
| 236 | */ |
| 237 | spin_lock(&ubifs_infos_lock); |
| 238 | for (i = 0; i < 2; i++) { |
| 239 | list_for_each_entry(c, &ubifs_infos, infos_list) { |
| 240 | long dirty_zn_cnt; |
| 241 | |
| 242 | if (!mutex_trylock(&c->umount_mutex)) { |
| 243 | /* |
| 244 | * Some un-mount is in progress, it will |
| 245 | * certainly free memory, so just return. |
| 246 | */ |
| 247 | spin_unlock(&ubifs_infos_lock); |
| 248 | return -1; |
| 249 | } |
| 250 | |
| 251 | dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); |
| 252 | |
| 253 | if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || |
| 254 | c->ro_media) { |
| 255 | mutex_unlock(&c->umount_mutex); |
| 256 | continue; |
| 257 | } |
| 258 | |
| 259 | if (c->cmt_state != COMMIT_RESTING) { |
| 260 | spin_unlock(&ubifs_infos_lock); |
| 261 | mutex_unlock(&c->umount_mutex); |
| 262 | return -1; |
| 263 | } |
| 264 | |
| 265 | if (i == 1) { |
| 266 | list_del(&c->infos_list); |
| 267 | list_add_tail(&c->infos_list, &ubifs_infos); |
| 268 | spin_unlock(&ubifs_infos_lock); |
| 269 | |
| 270 | ubifs_request_bg_commit(c); |
| 271 | mutex_unlock(&c->umount_mutex); |
| 272 | return -1; |
| 273 | } |
| 274 | mutex_unlock(&c->umount_mutex); |
| 275 | } |
| 276 | } |
| 277 | spin_unlock(&ubifs_infos_lock); |
| 278 | |
| 279 | return 0; |
| 280 | } |
| 281 | |
| 282 | int ubifs_shrinker(int nr, gfp_t gfp_mask) |
| 283 | { |
| 284 | int freed, contention = 0; |
| 285 | long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); |
| 286 | |
| 287 | if (nr == 0) |
| 288 | return clean_zn_cnt; |
| 289 | |
| 290 | if (!clean_zn_cnt) { |
| 291 | /* |
| 292 | * No clean znodes, nothing to reap. All we can do in this case |
| 293 | * is to kick background threads to start commit, which will |
| 294 | * probably make clean znodes which, in turn, will be freeable. |
| 295 | * And we return -1 which means will make VM call us again |
| 296 | * later. |
| 297 | */ |
| 298 | dbg_tnc("no clean znodes, kick a thread"); |
| 299 | return kick_a_thread(); |
| 300 | } |
| 301 | |
| 302 | freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); |
| 303 | if (freed >= nr) |
| 304 | goto out; |
| 305 | |
| 306 | dbg_tnc("not enough old znodes, try to free young ones"); |
| 307 | freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); |
| 308 | if (freed >= nr) |
| 309 | goto out; |
| 310 | |
| 311 | dbg_tnc("not enough young znodes, free all"); |
| 312 | freed += shrink_tnc_trees(nr - freed, 0, &contention); |
| 313 | |
| 314 | if (!freed && contention) { |
| 315 | dbg_tnc("freed nothing, but contention"); |
| 316 | return -1; |
| 317 | } |
| 318 | |
| 319 | out: |
| 320 | dbg_tnc("%d znodes were freed, requested %d", freed, nr); |
| 321 | return freed; |
| 322 | } |