#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
#include <linux/version.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include "extent_io.h"
#include "extent_map.h"

/* temporary define until extent_map moves out of btrfs */
struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
                                      unsigned long extra_flags,
                                      void (*ctor)(void *, struct kmem_cache *,
                                                   unsigned long));

static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;

static LIST_HEAD(buffers);
static LIST_HEAD(states);

#define BUFFER_LRU_MAX 64

struct tree_entry {
        u64 start;
        u64 end;
        struct rb_node rb_node;
};

struct extent_page_data {
        struct bio *bio;
        struct extent_io_tree *tree;
        get_extent_t *get_extent;
};

int __init extent_io_init(void)
{
        extent_state_cache = btrfs_cache_create("extent_state",
                                                sizeof(struct extent_state), 0,
                                                NULL);
        if (!extent_state_cache)
                return -ENOMEM;

        extent_buffer_cache = btrfs_cache_create("extent_buffers",
                                                 sizeof(struct extent_buffer), 0,
                                                 NULL);
        if (!extent_buffer_cache)
                goto free_state_cache;
        return 0;

free_state_cache:
        kmem_cache_destroy(extent_state_cache);
        return -ENOMEM;
}

void extent_io_exit(void)
{
        struct extent_state *state;

        while (!list_empty(&states)) {
                state = list_entry(states.next, struct extent_state, list);
                printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
                list_del(&state->list);
                kmem_cache_free(extent_state_cache, state);
        }

        if (extent_state_cache)
                kmem_cache_destroy(extent_state_cache);
        if (extent_buffer_cache)
                kmem_cache_destroy(extent_buffer_cache);
}

void extent_io_tree_init(struct extent_io_tree *tree,
                         struct address_space *mapping, gfp_t mask)
{
        tree->state.rb_node = NULL;
        tree->ops = NULL;
        tree->dirty_bytes = 0;
        spin_lock_init(&tree->lock);
        spin_lock_init(&tree->lru_lock);
        tree->mapping = mapping;
        INIT_LIST_HEAD(&tree->buffer_lru);
        tree->lru_size = 0;
        tree->last = NULL;
}
EXPORT_SYMBOL(extent_io_tree_init);
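
/*
 * Minimal usage sketch: the tree is normally embedded in a per-inode
 * structure and initialized once against that inode's page cache
 * mapping.  This illustrative helper is hypothetical and not called
 * anywhere in this file.
 */
static inline void example_io_tree_setup(struct extent_io_tree *tree,
                                         struct inode *inode)
{
        extent_io_tree_init(tree, inode->i_mapping, GFP_NOFS);
}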

void extent_io_tree_empty_lru(struct extent_io_tree *tree)
{
        struct extent_buffer *eb;
        while (!list_empty(&tree->buffer_lru)) {
                eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
                                lru);
                list_del_init(&eb->lru);
                free_extent_buffer(eb);
        }
}
EXPORT_SYMBOL(extent_io_tree_empty_lru);

struct extent_state *alloc_extent_state(gfp_t mask)
{
        struct extent_state *state;

        state = kmem_cache_alloc(extent_state_cache, mask);
        if (!state || IS_ERR(state))
                return state;
        state->state = 0;
        state->private = 0;
        state->tree = NULL;

        atomic_set(&state->refs, 1);
        init_waitqueue_head(&state->wq);
        return state;
}
EXPORT_SYMBOL(alloc_extent_state);

void free_extent_state(struct extent_state *state)
{
        if (!state)
                return;
        if (atomic_dec_and_test(&state->refs)) {
                WARN_ON(state->tree);
                kmem_cache_free(extent_state_cache, state);
        }
}
EXPORT_SYMBOL(free_extent_state);
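
/*
 * Sketch of the reference counting behind alloc/free_extent_state():
 * each extra user takes a reference, and the object only goes back to
 * the slab cache when the last reference is dropped.  This illustrative
 * helper is hypothetical and not called anywhere in this file.
 */
static inline int example_state_refcounting(gfp_t mask)
{
        struct extent_state *state = alloc_extent_state(mask);

        if (!state || IS_ERR(state))
                return -ENOMEM;
        atomic_inc(&state->refs);       /* a second user takes a reference */
        free_extent_state(state);       /* refs 2 -> 1, object survives */
        free_extent_state(state);       /* refs 1 -> 0, freed to the cache */
        return 0;
}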

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
                                   struct rb_node *node)
{
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
        struct tree_entry *entry;

        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct tree_entry, rb_node);

                if (offset < entry->start)
                        p = &(*p)->rb_left;
                else if (offset > entry->end)
                        p = &(*p)->rb_right;
                else
                        return parent;
        }

        entry = rb_entry(node, struct tree_entry, rb_node);
        rb_link_node(node, parent, p);
        rb_insert_color(node, root);
        return NULL;
}

static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
                                      struct rb_node **prev_ret,
                                      struct rb_node **next_ret)
{
        struct rb_root *root = &tree->state;
        struct rb_node *n = root->rb_node;
        struct rb_node *prev = NULL;
        struct rb_node *orig_prev = NULL;
        struct tree_entry *entry;
        struct tree_entry *prev_entry = NULL;

        if (tree->last) {
                struct extent_state *state;
                state = tree->last;
                if (state->start <= offset && offset <= state->end)
                        return &tree->last->rb_node;
        }
        while (n) {
                entry = rb_entry(n, struct tree_entry, rb_node);
                prev = n;
                prev_entry = entry;

                if (offset < entry->start)
                        n = n->rb_left;
                else if (offset > entry->end)
                        n = n->rb_right;
                else {
                        tree->last = rb_entry(n, struct extent_state, rb_node);
                        return n;
                }
        }

        if (prev_ret) {
                orig_prev = prev;
                while (prev && offset > prev_entry->end) {
                        prev = rb_next(prev);
                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                }
                *prev_ret = prev;
                prev = orig_prev;
        }

        if (next_ret) {
                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                while (prev && offset < prev_entry->start) {
                        prev = rb_prev(prev);
                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
                }
                *next_ret = prev;
        }
        return NULL;
}

static inline struct rb_node *tree_search(struct extent_io_tree *tree,
                                          u64 offset)
{
        struct rb_node *prev = NULL;
        struct rb_node *ret;

        ret = __etree_search(tree, offset, &prev, NULL);
        if (!ret) {
                if (prev) {
                        tree->last = rb_entry(prev, struct extent_state,
                                              rb_node);
                }
                return prev;
        }
        return ret;
}

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS set in their state
 * field are not merged because the end_io handlers need to be able to
 * do operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_io_tree *tree,
                       struct extent_state *state)
{
        struct extent_state *other;
        struct rb_node *other_node;

        if (state->state & EXTENT_IOBITS)
                return 0;

        other_node = rb_prev(&state->rb_node);
        if (other_node) {
                other = rb_entry(other_node, struct extent_state, rb_node);
                if (other->end == state->start - 1 &&
                    other->state == state->state) {
                        state->start = other->start;
                        other->tree = NULL;
                        if (tree->last == other)
                                tree->last = NULL;
                        rb_erase(&other->rb_node, &tree->state);
                        free_extent_state(other);
                }
        }
        other_node = rb_next(&state->rb_node);
        if (other_node) {
                other = rb_entry(other_node, struct extent_state, rb_node);
                if (other->start == state->end + 1 &&
                    other->state == state->state) {
                        other->start = state->start;
                        state->tree = NULL;
                        if (tree->last == state)
                                tree->last = NULL;
                        rb_erase(&state->rb_node, &tree->state);
                        free_extent_state(state);
                }
        }
        return 0;
}

static void set_state_cb(struct extent_io_tree *tree,
                         struct extent_state *state,
                         unsigned long bits)
{
        if (tree->ops && tree->ops->set_bit_hook) {
                tree->ops->set_bit_hook(tree->mapping->host, state->start,
                                        state->end, state->state, bits);
        }
}

static void clear_state_cb(struct extent_io_tree *tree,
                           struct extent_state *state,
                           unsigned long bits)
{
        if (tree->ops && tree->ops->clear_bit_hook) {
                tree->ops->clear_bit_hook(tree->mapping->host, state->start,
                                          state->end, state->state, bits);
        }
}

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_io_tree *tree,
                        struct extent_state *state, u64 start, u64 end,
                        int bits)
{
        struct rb_node *node;

        if (end < start) {
                printk("end < start %Lu %Lu\n", end, start);
                WARN_ON(1);
        }
        if (bits & EXTENT_DIRTY)
                tree->dirty_bytes += end - start + 1;
        set_state_cb(tree, state, bits);
        state->state |= bits;
        state->start = start;
        state->end = end;
        node = tree_insert(&tree->state, end, &state->rb_node);
        if (node) {
                struct extent_state *found;
                found = rb_entry(node, struct extent_state, rb_node);
                printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
                free_extent_state(state);
                return -EEXIST;
        }
        state->tree = tree;
        tree->last = state;
        merge_state(tree, state);
        return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [split, orig->end]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
                       struct extent_state *prealloc, u64 split)
{
        struct rb_node *node;
        prealloc->start = orig->start;
        prealloc->end = split - 1;
        prealloc->state = orig->state;
        orig->start = split;

        node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
        if (node) {
                struct extent_state *found;
                found = rb_entry(node, struct extent_state, rb_node);
                printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
                free_extent_state(prealloc);
                return -EEXIST;
        }
        prealloc->tree = tree;
        return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static int clear_state_bit(struct extent_io_tree *tree,
                           struct extent_state *state, int bits, int wake,
                           int delete)
{
        int ret = state->state & bits;

        if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
                WARN_ON(range > tree->dirty_bytes);
                tree->dirty_bytes -= range;
        }
        clear_state_cb(tree, state, bits);
        state->state &= ~bits;
        if (wake)
                wake_up(&state->wq);
        if (delete || state->state == 0) {
                if (state->tree) {
                        if (tree->last == state)
                                tree->last = NULL;
                        rb_erase(&state->rb_node, &tree->state);
                        state->tree = NULL;
                        free_extent_state(state);
                } else {
                        WARN_ON(1);
                }
        } else {
                merge_state(tree, state);
        }
        return ret;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     int bits, int wake, int delete, gfp_t mask)
{
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
        struct rb_node *node;
        unsigned long flags;
        int err;
        int set = 0;

again:
        if (!prealloc && (mask & __GFP_WAIT)) {
                prealloc = alloc_extent_state(mask);
                if (!prealloc)
                        return -ENOMEM;
        }

        spin_lock_irqsave(&tree->lock, flags);
        /*
         * this search will find the extents that end after
         * our range starts
         */
        node = tree_search(tree, start);
        if (!node)
                goto out;
        state = rb_entry(node, struct extent_state, rb_node);
        if (state->start > end)
                goto out;
        WARN_ON(state->end < start);

        /*
         * | ---- desired range ---- |
         *  | state | or
         *  | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip
         * bits on second half.
         *
         * If the extent we found extends past our range, we
         * just split and search again.  It'll get split again
         * the next time though.
         *
         * If the extent we found is inside our range, we clear
         * the desired bit on it.
         */

        if (state->start < start) {
                if (!prealloc)
                        prealloc = alloc_extent_state(GFP_ATOMIC);
                err = split_state(tree, state, prealloc, start);
                BUG_ON(err == -EEXIST);
                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
                        start = state->end + 1;
                        set |= clear_state_bit(tree, state, bits,
                                               wake, delete);
                } else {
                        start = state->start;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         * We need to split the extent, and clear the bit
         * on the first half
         */
        if (state->start <= end && state->end > end) {
                if (!prealloc)
                        prealloc = alloc_extent_state(GFP_ATOMIC);
                err = split_state(tree, state, prealloc, end + 1);
                BUG_ON(err == -EEXIST);

                if (wake)
                        wake_up(&state->wq);
                set |= clear_state_bit(tree, prealloc, bits,
                                       wake, delete);
                prealloc = NULL;
                goto out;
        }

        start = state->end + 1;
        set |= clear_state_bit(tree, state, bits, wake, delete);
        goto search_again;

out:
        spin_unlock_irqrestore(&tree->lock, flags);
        if (prealloc)
                free_extent_state(prealloc);

        return set;

search_again:
        if (start > end)
                goto out;
        spin_unlock_irqrestore(&tree->lock, flags);
        if (mask & __GFP_WAIT)
                cond_resched();
        goto again;
}
EXPORT_SYMBOL(clear_extent_bit);

static int wait_on_state(struct extent_io_tree *tree,
                         struct extent_state *state)
{
        DEFINE_WAIT(wait);
        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
        spin_unlock_irq(&tree->lock);
        schedule();
        spin_lock_irq(&tree->lock);
        finish_wait(&state->wq, &wait);
        return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
{
        struct extent_state *state;
        struct rb_node *node;

        spin_lock_irq(&tree->lock);
again:
        while (1) {
                /*
                 * this search will find all the extents that end after
                 * our range starts
                 */
                node = tree_search(tree, start);
                if (!node)
                        break;

                state = rb_entry(node, struct extent_state, rb_node);

                if (state->start > end)
                        goto out;

                if (state->state & bits) {
                        start = state->start;
                        atomic_inc(&state->refs);
                        wait_on_state(tree, state);
                        free_extent_state(state);
                        goto again;
                }
                start = state->end + 1;

                if (start > end)
                        break;

                if (need_resched()) {
                        spin_unlock_irq(&tree->lock);
                        cond_resched();
                        spin_lock_irq(&tree->lock);
                }
        }
out:
        spin_unlock_irq(&tree->lock);
        return 0;
}
EXPORT_SYMBOL(wait_extent_bit);

static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
                           int bits)
{
        if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
        set_state_cb(tree, state, bits);
        state->state |= bits;
}

/*
 * set some bits on a range in the tree.  This may require allocations
 * or sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
 * range already has the desired bits set.  The start of the existing
 * range is returned in failed_start in this case.
 *
 * [start, end] is inclusive
 * This takes the tree lock.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
                   int exclusive, u64 *failed_start, gfp_t mask)
{
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
        struct rb_node *node;
        unsigned long flags;
        int err = 0;
        int set;
        u64 last_start;
        u64 last_end;
again:
        if (!prealloc && (mask & __GFP_WAIT)) {
                prealloc = alloc_extent_state(mask);
                if (!prealloc)
                        return -ENOMEM;
        }

        spin_lock_irqsave(&tree->lock, flags);
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search(tree, start);
        if (!node) {
                err = insert_state(tree, prealloc, start, end, bits);
                prealloc = NULL;
                BUG_ON(err == -EEXIST);
                goto out;
        }

        state = rb_entry(node, struct extent_state, rb_node);
        last_start = state->start;
        last_end = state->end;

        /*
         * | ---- desired range ---- |
         * | state |
         *
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
                set = state->state & bits;
                if (set && exclusive) {
                        *failed_start = state->start;
                        err = -EEXIST;
                        goto out;
                }
                set_state_bits(tree, state, bits);
                start = state->end + 1;
                merge_state(tree, state);
                goto search_again;
        }

        /*
         * | ---- desired range ---- |
         * | state |
         *   or
         * | ------------- state -------------- |
         *
         * We need to split the extent we found, and may flip bits on
         * second half.
         *
         * If the extent we found extends past our
         * range, we just split and search again.  It'll get split
         * again the next time though.
         *
         * If the extent we found is inside our range, we set the
         * desired bit on it.
         */
        if (state->start < start) {
                set = state->state & bits;
                if (exclusive && set) {
                        *failed_start = start;
                        err = -EEXIST;
                        goto out;
                }
                err = split_state(tree, state, prealloc, start);
                BUG_ON(err == -EEXIST);
                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, bits);
                        start = state->end + 1;
                        merge_state(tree, state);
                } else {
                        start = state->start;
                }
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *     | state | or               | state |
         *
         * There's a hole, we need to insert something in it and
         * ignore the extent we found.
         */
        if (state->start > start) {
                u64 this_end;
                if (end < last_start)
                        this_end = end;
                else
                        this_end = last_start - 1;
                err = insert_state(tree, prealloc, start, this_end,
                                   bits);
                prealloc = NULL;
                BUG_ON(err == -EEXIST);
                if (err)
                        goto out;
                start = this_end + 1;
                goto search_again;
        }
        /*
         * | ---- desired range ---- |
         *                        | state |
         * We need to split the extent, and set the bit
         * on the first half
         */
        if (state->start <= end && state->end > end) {
                set = state->state & bits;
                if (exclusive && set) {
                        *failed_start = start;
                        err = -EEXIST;
                        goto out;
                }
                err = split_state(tree, state, prealloc, end + 1);
                BUG_ON(err == -EEXIST);

                set_state_bits(tree, prealloc, bits);
                merge_state(tree, prealloc);
                prealloc = NULL;
                goto out;
        }

        goto search_again;

out:
        spin_unlock_irqrestore(&tree->lock, flags);
        if (prealloc)
                free_extent_state(prealloc);

        return err;

search_again:
        if (start > end)
                goto out;
        spin_unlock_irqrestore(&tree->lock, flags);
        if (mask & __GFP_WAIT)
                cond_resched();
        goto again;
}
EXPORT_SYMBOL(set_extent_bit);
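
/*
 * Sketch of the basic pattern the wrappers below are built on: set a bit
 * on an inclusive byte range, then clear it again.  All ranges in this
 * file are [start, end] inclusive.  This illustrative helper is
 * hypothetical and not called anywhere in this file.
 */
static inline void example_dirty_then_clean(struct extent_io_tree *tree,
                                            u64 start, u64 end)
{
        set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, GFP_NOFS);
        clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, GFP_NOFS);
}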

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
{
        return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
                              mask);
}
EXPORT_SYMBOL(set_extent_dirty);

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    int bits, gfp_t mask)
{
        return set_extent_bit(tree, start, end, bits, 0, NULL,
                              mask);
}
EXPORT_SYMBOL(set_extent_bits);

int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      int bits, gfp_t mask)
{
        return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_bits);

int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
                        gfp_t mask)
{
        return set_extent_bit(tree, start, end,
                              EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
                              mask);
}
EXPORT_SYMBOL(set_extent_delalloc);

int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                       gfp_t mask)
{
        return clear_extent_bit(tree, start, end,
                                EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_dirty);

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                   gfp_t mask)
{
        return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
                              mask);
}
EXPORT_SYMBOL(set_extent_new);

int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
{
        return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_new);

int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                        gfp_t mask)
{
        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
                              mask);
}
EXPORT_SYMBOL(set_extent_uptodate);

int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                          gfp_t mask)
{
        return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_uptodate);

int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
                         gfp_t mask)
{
        return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
                              0, NULL, mask);
}
EXPORT_SYMBOL(set_extent_writeback);

int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
                           gfp_t mask)
{
        return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
}
EXPORT_SYMBOL(clear_extent_writeback);

int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
        return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
}
EXPORT_SYMBOL(wait_on_extent_writeback);

int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
        int err;
        u64 failed_start;
        while (1) {
                err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
                                     &failed_start, mask);
                if (err == -EEXIST && (mask & __GFP_WAIT)) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
                } else {
                        break;
                }
                WARN_ON(start > end);
        }
        return err;
}
EXPORT_SYMBOL(lock_extent);

int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
                  gfp_t mask)
{
        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
}
EXPORT_SYMBOL(unlock_extent);
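
/*
 * Sketch of the locking pattern used by the read and write paths later
 * in this file: lock_extent() waits (for __GFP_WAIT masks) until the
 * whole range is held, and unlock_extent() wakes any waiters.  This
 * illustrative helper is hypothetical and not called anywhere in this
 * file.
 */
static inline void example_locked_region(struct extent_io_tree *tree,
                                         u64 start, u64 end)
{
        lock_extent(tree, start, end, GFP_NOFS);
        /* ... issue or wait for IO against [start, end] here ... */
        unlock_extent(tree, start, end, GFP_NOFS);
}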

/*
 * helper function to set pages and extents in the tree dirty
 */
int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
{
        unsigned long index = start >> PAGE_CACHE_SHIFT;
        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
        struct page *page;

        while (index <= end_index) {
                page = find_get_page(tree->mapping, index);
                BUG_ON(!page);
                __set_page_dirty_nobuffers(page);
                page_cache_release(page);
                index++;
        }
        set_extent_dirty(tree, start, end, GFP_NOFS);
        return 0;
}
EXPORT_SYMBOL(set_range_dirty);

/*
 * helper function to set both pages and extents in the tree writeback
 */
int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
        unsigned long index = start >> PAGE_CACHE_SHIFT;
        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
        struct page *page;

        while (index <= end_index) {
                page = find_get_page(tree->mapping, index);
                BUG_ON(!page);
                set_page_writeback(page);
                page_cache_release(page);
                index++;
        }
        set_extent_writeback(tree, start, end, GFP_NOFS);
        return 0;
}
EXPORT_SYMBOL(set_range_writeback);

int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
                          u64 *start_ret, u64 *end_ret, int bits)
{
        struct rb_node *node;
        struct extent_state *state;
        int ret = 1;

        spin_lock_irq(&tree->lock);
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search(tree, start);
        if (!node || IS_ERR(node)) {
                goto out;
        }

        while (1) {
                state = rb_entry(node, struct extent_state, rb_node);
                if (state->end >= start && (state->state & bits)) {
                        *start_ret = state->start;
                        *end_ret = state->end;
                        ret = 0;
                        break;
                }
                node = rb_next(node);
                if (!node)
                        break;
        }
out:
        spin_unlock_irq(&tree->lock);
        return ret;
}
EXPORT_SYMBOL(find_first_extent_bit);
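
/*
 * Sketch of walking every dirty range at or after 'start' with
 * find_first_extent_bit(): the function returns 0 while a matching
 * extent is found and nonzero once the tree is exhausted.  This
 * illustrative helper is hypothetical and not called anywhere in this
 * file.
 */
static inline u64 example_count_dirty_ranges(struct extent_io_tree *tree,
                                             u64 start)
{
        u64 found_start;
        u64 found_end;
        u64 nr = 0;

        while (!find_first_extent_bit(tree, start, &found_start,
                                      &found_end, EXTENT_DIRTY)) {
                nr++;
                if (found_end == (u64)-1)
                        break;
                start = found_end + 1;
        }
        return nr;
}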

u64 find_lock_delalloc_range(struct extent_io_tree *tree,
                             u64 *start, u64 *end, u64 max_bytes)
{
        struct rb_node *node;
        struct extent_state *state;
        u64 cur_start = *start;
        u64 found = 0;
        u64 total_bytes = 0;

        spin_lock_irq(&tree->lock);
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
search_again:
        node = tree_search(tree, cur_start);
        if (!node || IS_ERR(node)) {
                *end = (u64)-1;
                goto out;
        }

        while (1) {
                state = rb_entry(node, struct extent_state, rb_node);
                if (found && state->start != cur_start) {
                        goto out;
                }
                if (!(state->state & EXTENT_DELALLOC)) {
                        if (!found)
                                *end = state->end;
                        goto out;
                }
                if (!found) {
                        struct extent_state *prev_state;
                        struct rb_node *prev_node = node;
                        while (1) {
                                prev_node = rb_prev(prev_node);
                                if (!prev_node)
                                        break;
                                prev_state = rb_entry(prev_node,
                                                      struct extent_state,
                                                      rb_node);
                                if (!(prev_state->state & EXTENT_DELALLOC))
                                        break;
                                state = prev_state;
                                node = prev_node;
                        }
                }
                if (state->state & EXTENT_LOCKED) {
                        DEFINE_WAIT(wait);
                        atomic_inc(&state->refs);
                        prepare_to_wait(&state->wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        spin_unlock_irq(&tree->lock);
                        schedule();
                        spin_lock_irq(&tree->lock);
                        finish_wait(&state->wq, &wait);
                        free_extent_state(state);
                        goto search_again;
                }
                set_state_cb(tree, state, EXTENT_LOCKED);
                state->state |= EXTENT_LOCKED;
                if (!found)
                        *start = state->start;
                found++;
                *end = state->end;
                cur_start = state->end + 1;
                node = rb_next(node);
                if (!node)
                        break;
                total_bytes += state->end - state->start + 1;
                if (total_bytes >= max_bytes)
                        break;
        }
out:
        spin_unlock_irq(&tree->lock);
        return found;
}

u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end, u64 max_bytes,
                     unsigned long bits)
{
        struct rb_node *node;
        struct extent_state *state;
        u64 cur_start = *start;
        u64 total_bytes = 0;
        int found = 0;

        if (search_end <= cur_start) {
                printk("search_end %Lu start %Lu\n", search_end, cur_start);
                WARN_ON(1);
                return 0;
        }

        spin_lock_irq(&tree->lock);
        if (cur_start == 0 && bits == EXTENT_DIRTY) {
                total_bytes = tree->dirty_bytes;
                goto out;
        }
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search(tree, cur_start);
        if (!node || IS_ERR(node)) {
                goto out;
        }

        while (1) {
                state = rb_entry(node, struct extent_state, rb_node);
                if (state->start > search_end)
                        break;
                if (state->end >= cur_start && (state->state & bits)) {
                        total_bytes += min(search_end, state->end) + 1 -
                                       max(cur_start, state->start);
                        if (total_bytes >= max_bytes)
                                break;
                        if (!found) {
                                *start = state->start;
                                found = 1;
                        }
                }
                node = rb_next(node);
                if (!node)
                        break;
        }
out:
        spin_unlock_irq(&tree->lock);
        return total_bytes;
}
/*
 * helper function to lock both pages and extents in the tree.
 * pages must be locked first.
 */
int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
{
        unsigned long index = start >> PAGE_CACHE_SHIFT;
        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
        struct page *page;
        int err;

        while (index <= end_index) {
                page = grab_cache_page(tree->mapping, index);
                if (!page) {
                        err = -ENOMEM;
                        goto failed;
                }
                if (IS_ERR(page)) {
                        err = PTR_ERR(page);
                        goto failed;
                }
                index++;
        }
        lock_extent(tree, start, end, GFP_NOFS);
        return 0;

failed:
        /*
         * we failed above in getting the page at 'index', so we undo here
         * up to but not including the page at 'index'
         */
        end_index = index;
        index = start >> PAGE_CACHE_SHIFT;
        while (index < end_index) {
                page = find_get_page(tree->mapping, index);
                unlock_page(page);
                page_cache_release(page);
                index++;
        }
        return err;
}
EXPORT_SYMBOL(lock_range);

/*
 * helper function to unlock both pages and extents in the tree.
 */
int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
{
        unsigned long index = start >> PAGE_CACHE_SHIFT;
        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
        struct page *page;

        while (index <= end_index) {
                page = find_get_page(tree->mapping, index);
                unlock_page(page);
                page_cache_release(page);
                index++;
        }
        unlock_extent(tree, start, end, GFP_NOFS);
        return 0;
}
EXPORT_SYMBOL(unlock_range);

int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
{
        struct rb_node *node;
        struct extent_state *state;
        int ret = 0;

        spin_lock_irq(&tree->lock);
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search(tree, start);
        if (!node || IS_ERR(node)) {
                ret = -ENOENT;
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
        if (state->start != start) {
                ret = -ENOENT;
                goto out;
        }
        state->private = private;
out:
        spin_unlock_irq(&tree->lock);
        return ret;
}

int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
{
        struct rb_node *node;
        struct extent_state *state;
        int ret = 0;

        spin_lock_irq(&tree->lock);
        /*
         * this search will find all the extents that end after
         * our range starts.
         */
        node = tree_search(tree, start);
        if (!node || IS_ERR(node)) {
                ret = -ENOENT;
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
        if (state->start != start) {
                ret = -ENOENT;
                goto out;
        }
        *private = state->private;
out:
        spin_unlock_irq(&tree->lock);
        return ret;
}
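
/*
 * Sketch of the private value round trip: the IO paths stash a
 * per-extent value (btrfs uses this for checksums) keyed by the exact
 * start offset of an extent, and both helpers return -ENOENT when no
 * extent begins at 'start'.  This illustrative helper is hypothetical
 * and not called anywhere in this file.
 */
static inline int example_private_roundtrip(struct extent_io_tree *tree,
                                            u64 start, u64 value)
{
        u64 stored = 0;
        int ret;

        ret = set_state_private(tree, start, value);
        if (ret)
                return ret;
        ret = get_state_private(tree, start, &stored);
        WARN_ON(!ret && stored != value);
        return ret;
}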

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if the whole range is covered
 * by extents that all have the bits set.  Otherwise, 1 is returned if
 * any extent in the range has any of the bits set.
 */
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   int bits, int filled)
{
        struct extent_state *state = NULL;
        struct rb_node *node;
        int bitset = 0;
        unsigned long flags;

        spin_lock_irqsave(&tree->lock, flags);
        node = tree_search(tree, start);
        while (node && start <= end) {
                state = rb_entry(node, struct extent_state, rb_node);

                if (filled && state->start > start) {
                        bitset = 0;
                        break;
                }

                if (state->start > end)
                        break;

                if (state->state & bits) {
                        bitset = 1;
                        if (!filled)
                                break;
                } else if (filled) {
                        bitset = 0;
                        break;
                }
                start = state->end + 1;
                if (start > end)
                        break;
                node = rb_next(node);
                if (!node) {
                        if (filled)
                                bitset = 0;
                        break;
                }
        }
        spin_unlock_irqrestore(&tree->lock, flags);
        return bitset;
}
EXPORT_SYMBOL(test_range_bit);
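
/*
 * Sketch of the two test_range_bit() modes: with filled == 1 every byte
 * of the range must be covered by states carrying the bit, while with
 * filled == 0 one matching extent anywhere in the range is enough.
 * This illustrative helper is hypothetical and not called anywhere in
 * this file.
 */
static inline int example_range_uptodate(struct extent_io_tree *tree,
                                         u64 start, u64 end)
{
        return test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
}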

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static int check_page_uptodate(struct extent_io_tree *tree,
                               struct page *page)
{
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
                SetPageUptodate(page);
        return 0;
}

/*
 * helper function to unlock a page if all the extents in the tree
 * for that page are unlocked
 */
static int check_page_locked(struct extent_io_tree *tree,
                             struct page *page)
{
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
                unlock_page(page);
        return 0;
}

/*
 * helper function to end page writeback if all the extents
 * in the tree for that page are done with writeback
 */
static int check_page_writeback(struct extent_io_tree *tree,
                                struct page *page)
{
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
                end_page_writeback(page);
        return 0;
}

/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
static void end_bio_extent_writepage(struct bio *bio, int err)
#else
static int end_bio_extent_writepage(struct bio *bio,
                                    unsigned int bytes_done, int err)
#endif
{
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct extent_state *state = bio->bi_private;
        struct extent_io_tree *tree = state->tree;
        struct rb_node *node;
        u64 start;
        u64 end;
        u64 cur;
        int whole_page;
        unsigned long flags;

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
        if (bio->bi_size)
                return 1;
#endif
        do {
                struct page *page = bvec->bv_page;
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
                        bvec->bv_offset;
                end = start + bvec->bv_len - 1;

                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
                        whole_page = 1;
                else
                        whole_page = 0;

                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);

                if (!uptodate) {
                        clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
                        ClearPageUptodate(page);
                        SetPageError(page);
                }

                if (tree->ops && tree->ops->writepage_end_io_hook) {
                        tree->ops->writepage_end_io_hook(page, start, end,
                                                         state);
                }

                /*
                 * bios can get merged in funny ways, and so we need to
                 * be careful with the state variable.  We know the
                 * state won't be merged with others because it has
                 * WRITEBACK set, but we can't be sure each biovec is
                 * sequential in the file.  So, if our cached state
                 * doesn't match the expected end, search the tree
                 * for the correct one.
                 */

                spin_lock_irqsave(&tree->lock, flags);
                if (!state || state->end != end) {
                        state = NULL;
                        node = __etree_search(tree, start, NULL, NULL);
                        if (node) {
                                state = rb_entry(node, struct extent_state,
                                                 rb_node);
                                if (state->end != end ||
                                    !(state->state & EXTENT_WRITEBACK))
                                        state = NULL;
                        }
                        if (!state) {
                                spin_unlock_irqrestore(&tree->lock, flags);
                                clear_extent_writeback(tree, start,
                                                       end, GFP_ATOMIC);
                                goto next_io;
                        }
                }
                cur = end;
                while (1) {
                        struct extent_state *clear = state;
                        cur = state->start;
                        node = rb_prev(&state->rb_node);
                        if (node) {
                                state = rb_entry(node,
                                                 struct extent_state,
                                                 rb_node);
                        } else {
                                state = NULL;
                        }

                        clear_state_bit(tree, clear, EXTENT_WRITEBACK,
                                        1, 0);
                        if (cur == start)
                                break;
                        if (cur < start) {
                                WARN_ON(1);
                                break;
                        }
                        if (!node)
                                break;
                }
                /* before releasing the lock, make sure the next state
                 * variable has the expected bits set and corresponds
                 * to the correct offsets in the file
                 */
                if (state && (state->end + 1 != start ||
                              !(state->state & EXTENT_WRITEBACK))) {
                        state = NULL;
                }
                spin_unlock_irqrestore(&tree->lock, flags);
next_io:

                if (whole_page)
                        end_page_writeback(page);
                else
                        check_page_writeback(tree, page);
        } while (bvec >= bio->bi_io_vec);
        bio_put(bio);
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
        return 0;
#endif
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
static void end_bio_extent_readpage(struct bio *bio, int err)
#else
static int end_bio_extent_readpage(struct bio *bio,
                                   unsigned int bytes_done, int err)
#endif
{
        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct extent_state *state = bio->bi_private;
        struct extent_io_tree *tree = state->tree;
        struct rb_node *node;
        u64 start;
        u64 end;
        u64 cur;
        unsigned long flags;
        int whole_page;
        int ret;

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
        if (bio->bi_size)
                return 1;
#endif

        do {
                struct page *page = bvec->bv_page;
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
                        bvec->bv_offset;
                end = start + bvec->bv_len - 1;

                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
                        whole_page = 1;
                else
                        whole_page = 0;

                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);

                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
                        ret = tree->ops->readpage_end_io_hook(page, start, end,
                                                              state);
                        if (ret)
                                uptodate = 0;
                }

                spin_lock_irqsave(&tree->lock, flags);
                if (!state || state->end != end) {
                        state = NULL;
                        node = __etree_search(tree, start, NULL, NULL);
                        if (node) {
                                state = rb_entry(node, struct extent_state,
                                                 rb_node);
                                if (state->end != end ||
                                    !(state->state & EXTENT_LOCKED))
                                        state = NULL;
                        }
                        if (!state) {
                                spin_unlock_irqrestore(&tree->lock, flags);
                                set_extent_uptodate(tree, start, end,
                                                    GFP_ATOMIC);
                                unlock_extent(tree, start, end, GFP_ATOMIC);
                                goto next_io;
                        }
                }

                cur = end;
                while (1) {
                        struct extent_state *clear = state;
                        cur = state->start;
                        node = rb_prev(&state->rb_node);
                        if (node) {
                                state = rb_entry(node,
                                                 struct extent_state,
                                                 rb_node);
                        } else {
                                state = NULL;
                        }
                        set_state_cb(tree, clear, EXTENT_UPTODATE);
                        clear->state |= EXTENT_UPTODATE;
                        clear_state_bit(tree, clear, EXTENT_LOCKED,
                                        1, 0);
                        if (cur == start)
                                break;
                        if (cur < start) {
                                WARN_ON(1);
                                break;
                        }
                        if (!node)
                                break;
                }
                /* before releasing the lock, make sure the next state
                 * variable has the expected bits set and corresponds
                 * to the correct offsets in the file
                 */
                if (state && (state->end + 1 != start ||
                              !(state->state & EXTENT_LOCKED))) {
                        state = NULL;
                }
                spin_unlock_irqrestore(&tree->lock, flags);
next_io:
                if (whole_page) {
                        if (uptodate) {
                                SetPageUptodate(page);
                        } else {
                                ClearPageUptodate(page);
                                SetPageError(page);
                        }
                        unlock_page(page);
                } else {
                        if (uptodate) {
                                check_page_uptodate(tree, page);
                        } else {
                                ClearPageUptodate(page);
                                SetPageError(page);
                        }
                        check_page_locked(tree, page);
                }
        } while (bvec >= bio->bi_io_vec);

        bio_put(bio);
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
        return 0;
#endif
}

/*
 * IO done from prepare_write is pretty simple, we just unlock
 * the structs in the extent tree when done, and set the uptodate bits
 * as appropriate.
 */
#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
static void end_bio_extent_preparewrite(struct bio *bio, int err)
#else
static int end_bio_extent_preparewrite(struct bio *bio,
                                       unsigned int bytes_done, int err)
#endif
{
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct extent_state *state = bio->bi_private;
        struct extent_io_tree *tree = state->tree;
        u64 start;
        u64 end;

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
        if (bio->bi_size)
                return 1;
#endif

        do {
                struct page *page = bvec->bv_page;
                start = ((u64)page->index << PAGE_CACHE_SHIFT) +
                        bvec->bv_offset;
                end = start + bvec->bv_len - 1;

                if (--bvec >= bio->bi_io_vec)
                        prefetchw(&bvec->bv_page->flags);

                if (uptodate) {
                        set_extent_uptodate(tree, start, end, GFP_ATOMIC);
                } else {
                        ClearPageUptodate(page);
                        SetPageError(page);
                }

                unlock_extent(tree, start, end, GFP_ATOMIC);

        } while (bvec >= bio->bi_io_vec);

        bio_put(bio);
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
        return 0;
#endif
}

static struct bio *
extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
                 gfp_t gfp_flags)
{
        struct bio *bio;

        bio = bio_alloc(gfp_flags, nr_vecs);

        if (bio == NULL && (current->flags & PF_MEMALLOC)) {
                while (!bio && (nr_vecs /= 2))
                        bio = bio_alloc(gfp_flags, nr_vecs);
        }

        if (bio) {
                bio->bi_bdev = bdev;
                bio->bi_sector = first_sector;
        }
        return bio;
}

static int submit_one_bio(int rw, struct bio *bio)
{
        u64 maxsector;
        int ret = 0;
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        struct page *page = bvec->bv_page;
        struct extent_io_tree *tree = bio->bi_private;
        struct rb_node *node;
        struct extent_state *state;
        u64 start;
        u64 end;

        start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
        end = start + bvec->bv_len - 1;

        spin_lock_irq(&tree->lock);
        node = __etree_search(tree, start, NULL, NULL);
        BUG_ON(!node);
        state = rb_entry(node, struct extent_state, rb_node);
        while (state->end < end) {
                node = rb_next(node);
                state = rb_entry(node, struct extent_state, rb_node);
        }
        BUG_ON(state->end != end);
        spin_unlock_irq(&tree->lock);

        bio->bi_private = state;

        bio_get(bio);

        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
        if (maxsector < bio->bi_sector) {
                printk("sector too large max %Lu got %llu\n", maxsector,
                       (unsigned long long)bio->bi_sector);
                WARN_ON(1);
        }

        submit_bio(rw, bio);
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                ret = -EOPNOTSUPP;
        bio_put(bio);
        return ret;
}

static int submit_extent_page(int rw, struct extent_io_tree *tree,
                              struct page *page, sector_t sector,
                              size_t size, unsigned long offset,
                              struct block_device *bdev,
                              struct bio **bio_ret,
                              unsigned long max_pages,
                              bio_end_io_t end_io_func)
{
        int ret = 0;
        struct bio *bio;
        int nr;

        if (bio_ret && *bio_ret) {
                bio = *bio_ret;
                if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
                    bio_add_page(bio, page, size, offset) < size) {
                        ret = submit_one_bio(rw, bio);
                        bio = NULL;
                } else {
                        return 0;
                }
        }
        nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
        bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
        if (!bio) {
                printk("failed to allocate bio nr %d\n", nr);
        }

        bio_add_page(bio, page, size, offset);
        bio->bi_end_io = end_io_func;
        bio->bi_private = tree;

        if (bio_ret) {
                *bio_ret = bio;
        } else {
                ret = submit_one_bio(rw, bio);
        }

        return ret;
}
1726
1727void set_page_extent_mapped(struct page *page)
1728{
1729 if (!PagePrivate(page)) {
1730 SetPagePrivate(page);
1731 WARN_ON(!page->mapping->a_ops->invalidatepage);
1732 set_page_private(page, EXTENT_PAGE_PRIVATE);
1733 page_cache_get(page);
1734 }
1735}
1736
1737void set_page_extent_head(struct page *page, unsigned long len)
1738{
1739 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1740}
1741
1742/*
1743 * basic readpage implementation. Locked extent state structs are inserted
1744 * into the tree that are removed when the IO is done (by the end_io
1745 * handlers)
1746 */
1747static int __extent_read_full_page(struct extent_io_tree *tree,
1748 struct page *page,
1749 get_extent_t *get_extent,
1750 struct bio **bio)
1751{
1752 struct inode *inode = page->mapping->host;
1753 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1754 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1755 u64 end;
1756 u64 cur = start;
1757 u64 extent_offset;
1758 u64 last_byte = i_size_read(inode);
1759 u64 block_start;
1760 u64 cur_end;
1761 sector_t sector;
1762 struct extent_map *em;
1763 struct block_device *bdev;
1764 int ret;
1765 int nr = 0;
1766 size_t page_offset = 0;
1767 size_t iosize;
1768 size_t blocksize = inode->i_sb->s_blocksize;
1769
1770 set_page_extent_mapped(page);
1771
1772 end = page_end;
1773 lock_extent(tree, start, end, GFP_NOFS);
1774
1775 while (cur <= end) {
1776 if (cur >= last_byte) {
1777 char *userpage;
1778 iosize = PAGE_CACHE_SIZE - page_offset;
1779 userpage = kmap_atomic(page, KM_USER0);
1780 memset(userpage + page_offset, 0, iosize);
1781 flush_dcache_page(page);
1782 kunmap_atomic(userpage, KM_USER0);
1783 set_extent_uptodate(tree, cur, cur + iosize - 1,
1784 GFP_NOFS);
1785 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1786 break;
1787 }
1788 em = get_extent(inode, page, page_offset, cur,
1789 end - cur + 1, 0);
1790 if (IS_ERR(em) || !em) {
1791 SetPageError(page);
1792 unlock_extent(tree, cur, end, GFP_NOFS);
1793 break;
1794 }
1795
1796 extent_offset = cur - em->start;
1797 BUG_ON(extent_map_end(em) <= cur);
1798 BUG_ON(end < cur);
1799
1800 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1801 cur_end = min(extent_map_end(em) - 1, end);
1802 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1803 sector = (em->block_start + extent_offset) >> 9;
1804 bdev = em->bdev;
1805 block_start = em->block_start;
1806 free_extent_map(em);
1807 em = NULL;
1808
1809 /* we've found a hole, just zero and go on */
1810 if (block_start == EXTENT_MAP_HOLE) {
1811 char *userpage;
1812 userpage = kmap_atomic(page, KM_USER0);
1813 memset(userpage + page_offset, 0, iosize);
1814 flush_dcache_page(page);
1815 kunmap_atomic(userpage, KM_USER0);
1816
1817 set_extent_uptodate(tree, cur, cur + iosize - 1,
1818 GFP_NOFS);
1819 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1820 cur = cur + iosize;
1821 page_offset += iosize;
1822 continue;
1823 }
1824 /* the get_extent function already copied into the page */
1825 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1826 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1827 cur = cur + iosize;
1828 page_offset += iosize;
1829 continue;
1830 }
Chris Mason70dec802008-01-29 09:59:12 -05001831		/* we have an inline extent but it didn't get marked
1832		 * up-to-date. Error out
1833 */
1834 if (block_start == EXTENT_MAP_INLINE) {
1835 SetPageError(page);
1836 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1837 cur = cur + iosize;
1838 page_offset += iosize;
1839 continue;
1840 }
Chris Masond1310b22008-01-24 16:13:08 -05001841
1842 ret = 0;
1843 if (tree->ops && tree->ops->readpage_io_hook) {
1844 ret = tree->ops->readpage_io_hook(page, cur,
1845 cur + iosize - 1);
1846 }
1847 if (!ret) {
1848 unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1849 nr -= page->index;
1850 ret = submit_extent_page(READ, tree, page,
1851 sector, iosize, page_offset,
1852 bdev, bio, nr,
1853 end_bio_extent_readpage);
1854 }
1855 if (ret)
1856 SetPageError(page);
1857 cur = cur + iosize;
1858 page_offset += iosize;
1859 nr++;
1860 }
1861 if (!nr) {
1862 if (!PageError(page))
1863 SetPageUptodate(page);
1864 unlock_page(page);
1865 }
1866 return 0;
1867}
1868
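/*
 * readpage entry point: read one page and submit whatever bio
 * __extent_read_full_page built up before returning.
 */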
1869int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
1870 get_extent_t *get_extent)
1871{
1872 struct bio *bio = NULL;
1873 int ret;
1874
1875 ret = __extent_read_full_page(tree, page, get_extent, &bio);
1876 if (bio)
1877 submit_one_bio(READ, bio);
1878 return ret;
1879}
1880EXPORT_SYMBOL(extent_read_full_page);
1881
1882/*
1883 * the writepage semantics are similar to regular writepage. Extent
1884 * records are inserted to lock ranges in the tree, and as dirty areas
1885 * are found, they are marked for writeback. Then the lock bits are removed
1886 * and the end_io handler clears the writeback ranges.
1887 */
1888static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1889 void *data)
1890{
1891 struct inode *inode = page->mapping->host;
1892 struct extent_page_data *epd = data;
1893 struct extent_io_tree *tree = epd->tree;
1894 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1895 u64 delalloc_start;
1896 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1897 u64 end;
1898 u64 cur = start;
1899 u64 extent_offset;
1900 u64 last_byte = i_size_read(inode);
1901 u64 block_start;
1902 u64 iosize;
1903 sector_t sector;
1904 struct extent_map *em;
1905 struct block_device *bdev;
1906 int ret;
1907 int nr = 0;
1908 size_t page_offset = 0;
1909 size_t blocksize;
1910 loff_t i_size = i_size_read(inode);
1911 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1912 u64 nr_delalloc;
1913 u64 delalloc_end;
1914
1915 WARN_ON(!PageLocked(page));
1916 if (page->index > end_index) {
1917 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1918 unlock_page(page);
1919 return 0;
1920 }
1921
1922 if (page->index == end_index) {
1923 char *userpage;
1924
1925 size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1926
1927 userpage = kmap_atomic(page, KM_USER0);
1928 memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
1929 flush_dcache_page(page);
1930 kunmap_atomic(userpage, KM_USER0);
1931 }
1932
1933 set_page_extent_mapped(page);
1934
1935 delalloc_start = start;
1936 delalloc_end = 0;
1937 while(delalloc_end < page_end) {
1938 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1939 &delalloc_end,
1940 128 * 1024 * 1024);
1941 if (nr_delalloc == 0) {
1942 delalloc_start = delalloc_end + 1;
1943 continue;
1944 }
1945 tree->ops->fill_delalloc(inode, delalloc_start,
1946 delalloc_end);
1947 clear_extent_bit(tree, delalloc_start,
1948 delalloc_end,
1949 EXTENT_LOCKED | EXTENT_DELALLOC,
1950 1, 0, GFP_NOFS);
1951 delalloc_start = delalloc_end + 1;
1952 }
1953 lock_extent(tree, start, page_end, GFP_NOFS);
1954
1955 end = page_end;
1956 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1957 printk("found delalloc bits after lock_extent\n");
1958 }
1959
1960 if (last_byte <= start) {
1961 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1962 goto done;
1963 }
1964
1965 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1966 blocksize = inode->i_sb->s_blocksize;
1967
1968 while (cur <= end) {
1969 if (cur >= last_byte) {
1970 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1971 break;
1972 }
1973 em = epd->get_extent(inode, page, page_offset, cur,
1974 end - cur + 1, 1);
1975 if (IS_ERR(em) || !em) {
1976 SetPageError(page);
1977 break;
1978 }
1979
1980 extent_offset = cur - em->start;
1981 BUG_ON(extent_map_end(em) <= cur);
1982 BUG_ON(end < cur);
1983 iosize = min(extent_map_end(em) - cur, end - cur + 1);
1984 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1985 sector = (em->block_start + extent_offset) >> 9;
1986 bdev = em->bdev;
1987 block_start = em->block_start;
1988 free_extent_map(em);
1989 em = NULL;
1990
1991 if (block_start == EXTENT_MAP_HOLE ||
1992 block_start == EXTENT_MAP_INLINE) {
1993 clear_extent_dirty(tree, cur,
1994 cur + iosize - 1, GFP_NOFS);
1995 cur = cur + iosize;
1996 page_offset += iosize;
1997 continue;
1998 }
1999
2000 /* leave this out until we have a page_mkwrite call */
2001 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2002 EXTENT_DIRTY, 0)) {
2003 cur = cur + iosize;
2004 page_offset += iosize;
2005 continue;
2006 }
2007 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2008 if (tree->ops && tree->ops->writepage_io_hook) {
2009 ret = tree->ops->writepage_io_hook(page, cur,
2010 cur + iosize - 1);
2011 } else {
2012 ret = 0;
2013 }
2014 if (ret)
2015 SetPageError(page);
2016 else {
2017 unsigned long max_nr = end_index + 1;
2018 set_range_writeback(tree, cur, cur + iosize - 1);
2019 if (!PageWriteback(page)) {
2020 printk("warning page %lu not writeback, "
2021 "cur %llu end %llu\n", page->index,
2022 (unsigned long long)cur,
2023 (unsigned long long)end);
2024 }
2025
2026 ret = submit_extent_page(WRITE, tree, page, sector,
2027 iosize, page_offset, bdev,
2028 &epd->bio, max_nr,
2029 end_bio_extent_writepage);
2030 if (ret)
2031 SetPageError(page);
2032 }
2033 cur = cur + iosize;
2034 page_offset += iosize;
2035 nr++;
2036 }
2037done:
2038 if (nr == 0) {
2039 /* make sure the mapping tag for page dirty gets cleared */
2040 set_page_writeback(page);
2041 end_page_writeback(page);
2042 }
2043 unlock_extent(tree, start, page_end, GFP_NOFS);
2044 unlock_page(page);
2045 return 0;
2046}
2047
2048#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
2049
2050/* Taken directly from 2.6.23 for 2.6.18 back port */
2051typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
2052 void *data);
2053
2054/**
2055 * write_cache_pages - walk the list of dirty pages of the given address space
2056 * and write all of them.
2057 * @mapping: address space structure to write
2058 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2059 * @writepage: function called for each page
2060 * @data: data passed to writepage function
2061 *
2062 * If a page is already under I/O, write_cache_pages() skips it, even
2063 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2064 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2065 * and msync() need to guarantee that all the data which was dirty at the time
2066 * the call was made get new I/O started against them. If wbc->sync_mode is
2067 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2068 * existing IO to complete.
2069 */
2070static int write_cache_pages(struct address_space *mapping,
2071 struct writeback_control *wbc, writepage_t writepage,
2072 void *data)
2073{
2074 struct backing_dev_info *bdi = mapping->backing_dev_info;
2075 int ret = 0;
2076 int done = 0;
2077 struct pagevec pvec;
2078 int nr_pages;
2079 pgoff_t index;
2080 pgoff_t end; /* Inclusive */
2081 int scanned = 0;
2082 int range_whole = 0;
2083
2084 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2085 wbc->encountered_congestion = 1;
2086 return 0;
2087 }
2088
2089 pagevec_init(&pvec, 0);
2090 if (wbc->range_cyclic) {
2091 index = mapping->writeback_index; /* Start from prev offset */
2092 end = -1;
2093 } else {
2094 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2095 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2096 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2097 range_whole = 1;
2098 scanned = 1;
2099 }
2100retry:
2101 while (!done && (index <= end) &&
2102 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2103 PAGECACHE_TAG_DIRTY,
2104 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2105 unsigned i;
2106
2107 scanned = 1;
2108 for (i = 0; i < nr_pages; i++) {
2109 struct page *page = pvec.pages[i];
2110
2111 /*
2112 * At this point we hold neither mapping->tree_lock nor
2113 * lock on the page itself: the page may be truncated or
2114 * invalidated (changing page->mapping to NULL), or even
2115 * swizzled back from swapper_space to tmpfs file
2116 * mapping
2117 */
2118 lock_page(page);
2119
2120 if (unlikely(page->mapping != mapping)) {
2121 unlock_page(page);
2122 continue;
2123 }
2124
2125 if (!wbc->range_cyclic && page->index > end) {
2126 done = 1;
2127 unlock_page(page);
2128 continue;
2129 }
2130
2131 if (wbc->sync_mode != WB_SYNC_NONE)
2132 wait_on_page_writeback(page);
2133
2134 if (PageWriteback(page) ||
2135 !clear_page_dirty_for_io(page)) {
2136 unlock_page(page);
2137 continue;
2138 }
2139
2140 ret = (*writepage)(page, wbc, data);
2141
2142 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2143 unlock_page(page);
2144 ret = 0;
2145 }
2146 if (ret || (--(wbc->nr_to_write) <= 0))
2147 done = 1;
2148 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2149 wbc->encountered_congestion = 1;
2150 done = 1;
2151 }
2152 }
2153 pagevec_release(&pvec);
2154 cond_resched();
2155 }
2156 if (!scanned && !done) {
2157 /*
2158 * We hit the last page and there is more work to be done: wrap
2159 * back to the start of the file
2160 */
2161 scanned = 1;
2162 index = 0;
2163 goto retry;
2164 }
2165 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2166 mapping->writeback_index = index;
2167 return ret;
2168}
2169#endif
2170
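/*
 * writepage entry point: write the page itself, then use write_cache_pages()
 * to push out up to 64 more dirty pages that follow it in the file, and
 * finally submit any bio that was built up along the way.
 */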
2171int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2172 get_extent_t *get_extent,
2173 struct writeback_control *wbc)
2174{
2175 int ret;
2176 struct address_space *mapping = page->mapping;
2177 struct extent_page_data epd = {
2178 .bio = NULL,
2179 .tree = tree,
2180 .get_extent = get_extent,
2181 };
2182 struct writeback_control wbc_writepages = {
2183 .bdi = wbc->bdi,
2184 .sync_mode = WB_SYNC_NONE,
2185 .older_than_this = NULL,
2186 .nr_to_write = 64,
2187 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2188 .range_end = (loff_t)-1,
2189 };
2190
2191
2192 ret = __extent_writepage(page, wbc, &epd);
2193
2194 write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
2195 if (epd.bio) {
2196 submit_one_bio(WRITE, epd.bio);
2197 }
2198 return ret;
2199}
2200EXPORT_SYMBOL(extent_write_full_page);
2201
2202
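/*
 * writepages entry point: walk the dirty pages of the mapping with
 * write_cache_pages(), sending each one through __extent_writepage and
 * submitting the partially built bio at the end.
 */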
2203int extent_writepages(struct extent_io_tree *tree,
2204 struct address_space *mapping,
2205 get_extent_t *get_extent,
2206 struct writeback_control *wbc)
2207{
2208 int ret = 0;
2209 struct extent_page_data epd = {
2210 .bio = NULL,
2211 .tree = tree,
2212 .get_extent = get_extent,
2213 };
2214
2215 ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
2216 if (epd.bio) {
2217 submit_one_bio(WRITE, epd.bio);
2218 }
2219 return ret;
2220}
2221EXPORT_SYMBOL(extent_writepages);
2222
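/*
 * readpages entry point: add each page to the page cache and LRU (open
 * coding add_to_page_cache_lru, which isn't exported), read it via
 * __extent_read_full_page and submit the shared bio once all pages are
 * queued.
 */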
2223int extent_readpages(struct extent_io_tree *tree,
2224 struct address_space *mapping,
2225 struct list_head *pages, unsigned nr_pages,
2226 get_extent_t get_extent)
2227{
2228 struct bio *bio = NULL;
2229 unsigned page_idx;
2230 struct pagevec pvec;
2231
2232 pagevec_init(&pvec, 0);
2233 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2234 struct page *page = list_entry(pages->prev, struct page, lru);
2235
2236 prefetchw(&page->flags);
2237 list_del(&page->lru);
2238 /*
2239 * what we want to do here is call add_to_page_cache_lru,
2240 * but that isn't exported, so we reproduce it here
2241 */
2242 if (!add_to_page_cache(page, mapping,
2243 page->index, GFP_KERNEL)) {
2244
2245 /* open coding of lru_cache_add, also not exported */
2246 page_cache_get(page);
2247 if (!pagevec_add(&pvec, page))
2248 __pagevec_lru_add(&pvec);
2249 __extent_read_full_page(tree, page, get_extent, &bio);
2250 }
2251 page_cache_release(page);
2252 }
2253 if (pagevec_count(&pvec))
2254 __pagevec_lru_add(&pvec);
2255 BUG_ON(!list_empty(pages));
2256 if (bio)
2257 submit_one_bio(READ, bio);
2258 return 0;
2259}
2260EXPORT_SYMBOL(extent_readpages);
2261
2262/*
2263 * basic invalidatepage code, this waits on any locked or writeback
2264 * ranges corresponding to the page, and then deletes any extent state
2265 * records from the tree
2266 */
2267int extent_invalidatepage(struct extent_io_tree *tree,
2268 struct page *page, unsigned long offset)
2269{
2270 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2271 u64 end = start + PAGE_CACHE_SIZE - 1;
2272 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2273
2274	start += (offset + blocksize - 1) & ~(blocksize - 1);
2275 if (start > end)
2276 return 0;
2277
2278 lock_extent(tree, start, end, GFP_NOFS);
2279 wait_on_extent_writeback(tree, start, end);
2280 clear_extent_bit(tree, start, end,
2281 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2282 1, 1, GFP_NOFS);
2283 return 0;
2284}
2285EXPORT_SYMBOL(extent_invalidatepage);
2286
2287/*
2288 * simple commit_write call; the page is marked dirty with set_page_dirty
2289 * and i_size is updated when the write extends past the current end of file
2290 */
2291int extent_commit_write(struct extent_io_tree *tree,
2292 struct inode *inode, struct page *page,
2293 unsigned from, unsigned to)
2294{
2295 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2296
2297 set_page_extent_mapped(page);
2298 set_page_dirty(page);
2299
2300 if (pos > inode->i_size) {
2301 i_size_write(inode, pos);
2302 mark_inode_dirty(inode);
2303 }
2304 return 0;
2305}
2306EXPORT_SYMBOL(extent_commit_write);
2307
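/*
 * prepare_write helper: walk the blocks touched by the write, zero the
 * parts of newly allocated blocks that fall outside [from, to), read in
 * old blocks that are only partially overwritten, and wait for those reads
 * before returning.
 */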
2308int extent_prepare_write(struct extent_io_tree *tree,
2309 struct inode *inode, struct page *page,
2310 unsigned from, unsigned to, get_extent_t *get_extent)
2311{
2312 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2313 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2314 u64 block_start;
2315 u64 orig_block_start;
2316 u64 block_end;
2317 u64 cur_end;
2318 struct extent_map *em;
2319 unsigned blocksize = 1 << inode->i_blkbits;
2320 size_t page_offset = 0;
2321 size_t block_off_start;
2322 size_t block_off_end;
2323 int err = 0;
2324 int iocount = 0;
2325 int ret = 0;
2326 int isnew;
2327
2328 set_page_extent_mapped(page);
2329
2330 block_start = (page_start + from) & ~((u64)blocksize - 1);
2331 block_end = (page_start + to - 1) | (blocksize - 1);
2332 orig_block_start = block_start;
2333
2334 lock_extent(tree, page_start, page_end, GFP_NOFS);
2335 while(block_start <= block_end) {
2336 em = get_extent(inode, page, page_offset, block_start,
2337 block_end - block_start + 1, 1);
2338 if (IS_ERR(em) || !em) {
2339 goto err;
2340 }
2341 cur_end = min(block_end, extent_map_end(em) - 1);
2342 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2343 block_off_end = block_off_start + blocksize;
2344 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2345
2346 if (!PageUptodate(page) && isnew &&
2347 (block_off_end > to || block_off_start < from)) {
2348 void *kaddr;
2349
2350 kaddr = kmap_atomic(page, KM_USER0);
2351 if (block_off_end > to)
2352 memset(kaddr + to, 0, block_off_end - to);
2353 if (block_off_start < from)
2354 memset(kaddr + block_off_start, 0,
2355 from - block_off_start);
2356 flush_dcache_page(page);
2357 kunmap_atomic(kaddr, KM_USER0);
2358 }
2359 if ((em->block_start != EXTENT_MAP_HOLE &&
2360 em->block_start != EXTENT_MAP_INLINE) &&
2361 !isnew && !PageUptodate(page) &&
2362 (block_off_end > to || block_off_start < from) &&
2363 !test_range_bit(tree, block_start, cur_end,
2364 EXTENT_UPTODATE, 1)) {
2365 u64 sector;
2366 u64 extent_offset = block_start - em->start;
2367 size_t iosize;
2368 sector = (em->block_start + extent_offset) >> 9;
2369 iosize = (cur_end - block_start + blocksize) &
2370 ~((u64)blocksize - 1);
2371 /*
2372 * we've already got the extent locked, but we
2373 * need to split the state such that our end_bio
2374 * handler can clear the lock.
2375 */
2376 set_extent_bit(tree, block_start,
2377 block_start + iosize - 1,
2378 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2379 ret = submit_extent_page(READ, tree, page,
2380 sector, iosize, page_offset, em->bdev,
2381 NULL, 1,
2382 end_bio_extent_preparewrite);
2383 iocount++;
2384 block_start = block_start + iosize;
2385 } else {
2386 set_extent_uptodate(tree, block_start, cur_end,
2387 GFP_NOFS);
2388 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2389 block_start = cur_end + 1;
2390 }
2391 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2392 free_extent_map(em);
2393 }
2394 if (iocount) {
2395 wait_extent_bit(tree, orig_block_start,
2396 block_end, EXTENT_LOCKED);
2397 }
2398 check_page_uptodate(tree, page);
2399err:
2400 /* FIXME, zero out newly allocated blocks on error */
2401 return err;
2402}
2403EXPORT_SYMBOL(extent_prepare_write);
2404
2405/*
2406 * a helper for releasepage. As long as there are no locked extents
2407 * in the range corresponding to the page, both state records and extent
2408 * map records are removed
2409 */
2410int try_release_extent_mapping(struct extent_map_tree *map,
Chris Mason70dec802008-01-29 09:59:12 -05002411 struct extent_io_tree *tree, struct page *page,
2412 gfp_t mask)
Chris Masond1310b22008-01-24 16:13:08 -05002413{
2414 struct extent_map *em;
2415 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2416 u64 end = start + PAGE_CACHE_SIZE - 1;
2417 u64 orig_start = start;
2418 int ret = 1;
2419
Chris Mason70dec802008-01-29 09:59:12 -05002420 if ((mask & __GFP_WAIT) &&
2421 page->mapping->host->i_size > 16 * 1024 * 1024) {
2422 while (start <= end) {
2423 spin_lock(&map->lock);
2424 em = lookup_extent_mapping(map, start, end);
2425 if (!em || IS_ERR(em)) {
2426 spin_unlock(&map->lock);
2427 break;
2428 }
2429 if (em->start != start) {
2430 spin_unlock(&map->lock);
2431 free_extent_map(em);
2432 break;
2433 }
2434 if (!test_range_bit(tree, em->start,
2435 extent_map_end(em) - 1,
2436 EXTENT_LOCKED, 0)) {
2437 remove_extent_mapping(map, em);
2438 /* once for the rb tree */
2439 free_extent_map(em);
2440 }
2441 start = extent_map_end(em);
Chris Masond1310b22008-01-24 16:13:08 -05002442 spin_unlock(&map->lock);
Chris Mason70dec802008-01-29 09:59:12 -05002443
2444 /* once for us */
Chris Masond1310b22008-01-24 16:13:08 -05002445 free_extent_map(em);
2446 }
Chris Masond1310b22008-01-24 16:13:08 -05002447 }
Chris Mason70dec802008-01-29 09:59:12 -05002448 if (test_range_bit(tree, orig_start, end, EXTENT_IOBITS, 0))
Chris Masond1310b22008-01-24 16:13:08 -05002449 ret = 0;
Chris Mason70dec802008-01-29 09:59:12 -05002450 else {
2451 if ((mask & GFP_NOFS) == GFP_NOFS)
2452 mask = GFP_NOFS;
Chris Masond1310b22008-01-24 16:13:08 -05002453 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
Chris Mason70dec802008-01-29 09:59:12 -05002454 1, 1, mask);
2455 }
Chris Masond1310b22008-01-24 16:13:08 -05002456 return ret;
2457}
2458EXPORT_SYMBOL(try_release_extent_mapping);
2459
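/*
 * bmap entry point: map a logical block in the file to an on-disk block
 * number via the get_extent callback.  Holes and inline extents map to 0.
 */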
2460sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2461 get_extent_t *get_extent)
2462{
2463 struct inode *inode = mapping->host;
2464 u64 start = iblock << inode->i_blkbits;
2465 sector_t sector = 0;
2466 struct extent_map *em;
2467
2468 em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
2469 if (!em || IS_ERR(em))
2470 return 0;
2471
2472 if (em->block_start == EXTENT_MAP_INLINE ||
2473 em->block_start == EXTENT_MAP_HOLE)
2474 goto out;
2475
2476 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
Chris Masond1310b22008-01-24 16:13:08 -05002477out:
2478 free_extent_map(em);
2479 return sector;
2480}
2481
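/*
 * each extent_io_tree keeps a small LRU (at most BUFFER_LRU_MAX entries) of
 * recently used extent buffers.  add_lru inserts or refreshes an entry and
 * evicts the oldest buffer when the list is full; find_lru looks a buffer
 * up by start and length and takes a reference on it.
 */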
2482static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb)
2483{
2484 if (list_empty(&eb->lru)) {
2485 extent_buffer_get(eb);
2486 list_add(&eb->lru, &tree->buffer_lru);
2487 tree->lru_size++;
2488 if (tree->lru_size >= BUFFER_LRU_MAX) {
2489 struct extent_buffer *rm;
2490 rm = list_entry(tree->buffer_lru.prev,
2491 struct extent_buffer, lru);
2492 tree->lru_size--;
2493 list_del_init(&rm->lru);
2494 free_extent_buffer(rm);
2495 }
2496 } else
2497 list_move(&eb->lru, &tree->buffer_lru);
2498 return 0;
2499}
2500static struct extent_buffer *find_lru(struct extent_io_tree *tree,
2501 u64 start, unsigned long len)
2502{
2503 struct list_head *lru = &tree->buffer_lru;
2504 struct list_head *cur = lru->next;
2505 struct extent_buffer *eb;
2506
2507 if (list_empty(lru))
2508 return NULL;
2509
2510 do {
2511 eb = list_entry(cur, struct extent_buffer, lru);
2512 if (eb->start == start && eb->len == len) {
2513 extent_buffer_get(eb);
2514 return eb;
2515 }
2516 cur = cur->next;
2517 } while (cur != lru);
2518 return NULL;
2519}
2520
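/*
 * helpers for the pages backing an extent buffer: num_extent_pages counts
 * them, and extent_buffer_page returns page i, using first_page directly
 * for i == 0 and the page cache radix tree for the rest.
 */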
2521static inline unsigned long num_extent_pages(u64 start, u64 len)
2522{
2523 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2524 (start >> PAGE_CACHE_SHIFT);
2525}
2526
2527static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2528 unsigned long i)
2529{
2530 struct page *p;
2531 struct address_space *mapping;
2532
2533 if (i == 0)
2534 return eb->first_page;
2535 i += eb->start >> PAGE_CACHE_SHIFT;
2536 mapping = eb->first_page->mapping;
2537 read_lock_irq(&mapping->tree_lock);
2538 p = radix_tree_lookup(&mapping->page_tree, i);
2539 read_unlock_irq(&mapping->tree_lock);
2540 return p;
2541}
2542
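/*
 * allocate an extent buffer struct for the given range, checking the
 * per-tree LRU first so a recently used buffer can be handed back instead.
 */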
2543static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2544 u64 start,
2545 unsigned long len,
2546 gfp_t mask)
2547{
2548 struct extent_buffer *eb = NULL;
2549
2550 spin_lock(&tree->lru_lock);
2551 eb = find_lru(tree, start, len);
2552 spin_unlock(&tree->lru_lock);
2553 if (eb) {
2554 return eb;
2555 }
2556
2557 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2558 INIT_LIST_HEAD(&eb->lru);
2559 eb->start = start;
2560 eb->len = len;
2561 atomic_set(&eb->refs, 1);
2562
2563 return eb;
2564}
2565
2566static void __free_extent_buffer(struct extent_buffer *eb)
2567{
2568 kmem_cache_free(extent_buffer_cache, eb);
2569}
2570
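/*
 * create an extent buffer for the given range: find or create the backing
 * pages in the page cache, mark them extent mapped and note whether they
 * are all uptodate.  A caller supplied page0 is used as the first page and
 * is expected to already be uptodate.  The buffer is put on the per-tree
 * LRU before it is returned.
 */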
2571struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2572 u64 start, unsigned long len,
2573 struct page *page0,
2574 gfp_t mask)
2575{
2576 unsigned long num_pages = num_extent_pages(start, len);
2577 unsigned long i;
2578 unsigned long index = start >> PAGE_CACHE_SHIFT;
2579 struct extent_buffer *eb;
2580 struct page *p;
2581 struct address_space *mapping = tree->mapping;
2582 int uptodate = 1;
2583
2584 eb = __alloc_extent_buffer(tree, start, len, mask);
2585 if (!eb || IS_ERR(eb))
2586 return NULL;
2587
2588 if (eb->flags & EXTENT_BUFFER_FILLED)
2589 goto lru_add;
2590
2591 if (page0) {
2592 eb->first_page = page0;
2593 i = 1;
2594 index++;
2595 page_cache_get(page0);
2596 mark_page_accessed(page0);
2597 set_page_extent_mapped(page0);
2598 WARN_ON(!PageUptodate(page0));
2599 set_page_extent_head(page0, len);
2600 } else {
2601 i = 0;
2602 }
2603 for (; i < num_pages; i++, index++) {
2604 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2605 if (!p) {
2606 WARN_ON(1);
2607 goto fail;
2608 }
2609 set_page_extent_mapped(p);
2610 mark_page_accessed(p);
2611 if (i == 0) {
2612 eb->first_page = p;
2613 set_page_extent_head(p, len);
2614 } else {
2615 set_page_private(p, EXTENT_PAGE_PRIVATE);
2616 }
2617 if (!PageUptodate(p))
2618 uptodate = 0;
2619 unlock_page(p);
2620 }
2621 if (uptodate)
2622 eb->flags |= EXTENT_UPTODATE;
2623 eb->flags |= EXTENT_BUFFER_FILLED;
2624
2625lru_add:
2626 spin_lock(&tree->lru_lock);
2627 add_lru(tree, eb);
2628 spin_unlock(&tree->lru_lock);
2629 return eb;
2630
2631fail:
2632 spin_lock(&tree->lru_lock);
2633 list_del_init(&eb->lru);
2634 spin_unlock(&tree->lru_lock);
2635 if (!atomic_dec_and_test(&eb->refs))
2636 return NULL;
2637 for (index = 1; index < i; index++) {
2638 page_cache_release(extent_buffer_page(eb, index));
2639 }
2640 if (i > 0)
2641 page_cache_release(extent_buffer_page(eb, 0));
2642 __free_extent_buffer(eb);
2643 return NULL;
2644}
2645EXPORT_SYMBOL(alloc_extent_buffer);
2646
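/*
 * like alloc_extent_buffer, but only uses pages that are already in the
 * page cache (find_lock_page); if any backing page is missing the buffer
 * is torn down and NULL is returned.
 */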
2647struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
2648 u64 start, unsigned long len,
2649 gfp_t mask)
2650{
2651 unsigned long num_pages = num_extent_pages(start, len);
2652 unsigned long i;
2653 unsigned long index = start >> PAGE_CACHE_SHIFT;
2654 struct extent_buffer *eb;
2655 struct page *p;
2656 struct address_space *mapping = tree->mapping;
2657 int uptodate = 1;
2658
2659 eb = __alloc_extent_buffer(tree, start, len, mask);
2660 if (!eb || IS_ERR(eb))
2661 return NULL;
2662
2663 if (eb->flags & EXTENT_BUFFER_FILLED)
2664 goto lru_add;
2665
2666 for (i = 0; i < num_pages; i++, index++) {
2667 p = find_lock_page(mapping, index);
2668 if (!p) {
2669 goto fail;
2670 }
2671 set_page_extent_mapped(p);
2672 mark_page_accessed(p);
2673
2674 if (i == 0) {
2675 eb->first_page = p;
2676 set_page_extent_head(p, len);
2677 } else {
2678 set_page_private(p, EXTENT_PAGE_PRIVATE);
2679 }
2680
2681 if (!PageUptodate(p))
2682 uptodate = 0;
2683 unlock_page(p);
2684 }
2685 if (uptodate)
2686 eb->flags |= EXTENT_UPTODATE;
2687 eb->flags |= EXTENT_BUFFER_FILLED;
2688
2689lru_add:
2690 spin_lock(&tree->lru_lock);
2691 add_lru(tree, eb);
2692 spin_unlock(&tree->lru_lock);
2693 return eb;
2694fail:
2695 spin_lock(&tree->lru_lock);
2696 list_del_init(&eb->lru);
2697 spin_unlock(&tree->lru_lock);
2698 if (!atomic_dec_and_test(&eb->refs))
2699 return NULL;
2700 for (index = 1; index < i; index++) {
2701 page_cache_release(extent_buffer_page(eb, index));
2702 }
2703 if (i > 0)
2704 page_cache_release(extent_buffer_page(eb, 0));
2705 __free_extent_buffer(eb);
2706 return NULL;
2707}
2708EXPORT_SYMBOL(find_extent_buffer);
2709
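/*
 * drop a reference on an extent buffer.  When the last reference goes away
 * the references on the backing pages are released and the struct is freed.
 */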
2710void free_extent_buffer(struct extent_buffer *eb)
2711{
2712 unsigned long i;
2713 unsigned long num_pages;
2714
2715 if (!eb)
2716 return;
2717
2718 if (!atomic_dec_and_test(&eb->refs))
2719 return;
2720
2721 WARN_ON(!list_empty(&eb->lru));
2722 num_pages = num_extent_pages(eb->start, eb->len);
2723
2724 for (i = 1; i < num_pages; i++) {
2725 page_cache_release(extent_buffer_page(eb, i));
2726 }
2727 page_cache_release(extent_buffer_page(eb, 0));
2728 __free_extent_buffer(eb);
2729}
2730EXPORT_SYMBOL(free_extent_buffer);
2731
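/*
 * clear the dirty state of an extent buffer: the EXTENT_DIRTY bits are
 * cleared in the tree and each backing page is cleaned, unless the first
 * or last page is shared with a range that is still dirty.
 */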
2732int clear_extent_buffer_dirty(struct extent_io_tree *tree,
2733 struct extent_buffer *eb)
2734{
2735 int set;
2736 unsigned long i;
2737 unsigned long num_pages;
2738 struct page *page;
2739
2740 u64 start = eb->start;
2741 u64 end = start + eb->len - 1;
2742
2743 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2744 num_pages = num_extent_pages(eb->start, eb->len);
2745
2746 for (i = 0; i < num_pages; i++) {
2747 page = extent_buffer_page(eb, i);
2748 lock_page(page);
2749 if (i == 0)
2750 set_page_extent_head(page, eb->len);
2751 else
2752 set_page_private(page, EXTENT_PAGE_PRIVATE);
2753
2754 /*
2755 * if we're on the last page or the first page and the
2756		 * to make sure we don't clean a page that is partially dirty
2757 * to make sure we don't clean page that is partially dirty
2758 */
2759 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2760 ((i == num_pages - 1) &&
2761 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2762 start = (u64)page->index << PAGE_CACHE_SHIFT;
2763 end = start + PAGE_CACHE_SIZE - 1;
2764 if (test_range_bit(tree, start, end,
2765 EXTENT_DIRTY, 0)) {
2766 unlock_page(page);
2767 continue;
2768 }
2769 }
2770 clear_page_dirty_for_io(page);
Chris Mason70dec802008-01-29 09:59:12 -05002771 read_lock_irq(&page->mapping->tree_lock);
Chris Masond1310b22008-01-24 16:13:08 -05002772 if (!PageDirty(page)) {
2773 radix_tree_tag_clear(&page->mapping->page_tree,
2774 page_index(page),
2775 PAGECACHE_TAG_DIRTY);
2776 }
Chris Mason70dec802008-01-29 09:59:12 -05002777 read_unlock_irq(&page->mapping->tree_lock);
Chris Masond1310b22008-01-24 16:13:08 -05002778 unlock_page(page);
2779 }
2780 return 0;
2781}
2782EXPORT_SYMBOL(clear_extent_buffer_dirty);
2783
2784int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
2785 struct extent_buffer *eb)
2786{
2787 return wait_on_extent_writeback(tree, eb->start,
2788 eb->start + eb->len - 1);
2789}
2790EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2791
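/*
 * mark an extent buffer dirty: make sure page->private is set up on every
 * backing page, dirty the pages, and set EXTENT_DIRTY on the buffer's range
 * in the tree.
 */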
2792int set_extent_buffer_dirty(struct extent_io_tree *tree,
2793 struct extent_buffer *eb)
2794{
2795 unsigned long i;
2796 unsigned long num_pages;
2797
2798 num_pages = num_extent_pages(eb->start, eb->len);
2799 for (i = 0; i < num_pages; i++) {
2800 struct page *page = extent_buffer_page(eb, i);
2801 /* writepage may need to do something special for the
2802 * first page, we have to make sure page->private is
2803 * properly set. releasepage may drop page->private
2804 * on us if the page isn't already dirty.
2805 */
2806 if (i == 0) {
2807 lock_page(page);
2808 set_page_extent_head(page, eb->len);
2809 } else if (PagePrivate(page) &&
2810 page->private != EXTENT_PAGE_PRIVATE) {
2811 lock_page(page);
2812 set_page_extent_mapped(page);
2813 unlock_page(page);
2814 }
2815 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2816 if (i == 0)
2817 unlock_page(page);
2818 }
2819 return set_extent_dirty(tree, eb->start,
2820 eb->start + eb->len - 1, GFP_NOFS);
2821}
2822EXPORT_SYMBOL(set_extent_buffer_dirty);
2823
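/*
 * mark an extent buffer uptodate: set EXTENT_UPTODATE on its range and
 * SetPageUptodate on each backing page.  Unaligned first and last pages go
 * through check_page_uptodate so a shared page is only marked uptodate when
 * the whole page really is.
 */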
2824int set_extent_buffer_uptodate(struct extent_io_tree *tree,
2825 struct extent_buffer *eb)
2826{
2827 unsigned long i;
2828 struct page *page;
2829 unsigned long num_pages;
2830
2831 num_pages = num_extent_pages(eb->start, eb->len);
2832
2833 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2834 GFP_NOFS);
2835 for (i = 0; i < num_pages; i++) {
2836 page = extent_buffer_page(eb, i);
2837 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2838 ((i == num_pages - 1) &&
2839 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2840 check_page_uptodate(tree, page);
2841 continue;
2842 }
2843 SetPageUptodate(page);
2844 }
2845 return 0;
2846}
2847EXPORT_SYMBOL(set_extent_buffer_uptodate);
2848
2849int extent_buffer_uptodate(struct extent_io_tree *tree,
2850 struct extent_buffer *eb)
2851{
2852 if (eb->flags & EXTENT_UPTODATE)
2853 return 1;
2854 return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2855 EXTENT_UPTODATE, 1);
2856}
2857EXPORT_SYMBOL(extent_buffer_uptodate);
2858
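/*
 * read any backing pages of an extent buffer that are not yet uptodate,
 * starting with the page that contains 'start'.  With wait == 0, pages that
 * are already locked are skipped and the reads are not waited for;
 * otherwise the function blocks until every page is read and returns -EIO
 * if one of them fails to come uptodate.
 */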
2859int read_extent_buffer_pages(struct extent_io_tree *tree,
2860 struct extent_buffer *eb,
2861 u64 start,
2862 int wait)
2863{
2864 unsigned long i;
2865 unsigned long start_i;
2866 struct page *page;
2867 int err;
2868 int ret = 0;
2869 unsigned long num_pages;
2870
2871 if (eb->flags & EXTENT_UPTODATE)
2872 return 0;
2873
2874 if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2875 EXTENT_UPTODATE, 1)) {
2876 return 0;
2877 }
2878
2879 if (start) {
2880 WARN_ON(start < eb->start);
2881 start_i = (start >> PAGE_CACHE_SHIFT) -
2882 (eb->start >> PAGE_CACHE_SHIFT);
2883 } else {
2884 start_i = 0;
2885 }
2886
2887 num_pages = num_extent_pages(eb->start, eb->len);
2888 for (i = start_i; i < num_pages; i++) {
2889 page = extent_buffer_page(eb, i);
2890 if (PageUptodate(page)) {
2891 continue;
2892 }
2893 if (!wait) {
2894 if (TestSetPageLocked(page)) {
2895 continue;
2896 }
2897 } else {
2898 lock_page(page);
2899 }
2900 if (!PageUptodate(page)) {
2901 err = page->mapping->a_ops->readpage(NULL, page);
2902 if (err) {
2903 ret = err;
2904 }
2905 } else {
2906 unlock_page(page);
2907 }
2908 }
2909
2910 if (ret || !wait) {
2911 return ret;
2912 }
Chris Masond1310b22008-01-24 16:13:08 -05002913 for (i = start_i; i < num_pages; i++) {
2914 page = extent_buffer_page(eb, i);
2915 wait_on_page_locked(page);
2916 if (!PageUptodate(page)) {
2917 ret = -EIO;
2918 }
2919 }
2920 if (!ret)
2921 eb->flags |= EXTENT_UPTODATE;
2922 return ret;
2923}
2924EXPORT_SYMBOL(read_extent_buffer_pages);
2925
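/*
 * copy len bytes starting at offset 'start' of the extent buffer into dstv,
 * crossing page boundaries as needed.
 */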
2926void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2927 unsigned long start,
2928 unsigned long len)
2929{
2930 size_t cur;
2931 size_t offset;
2932 struct page *page;
2933 char *kaddr;
2934 char *dst = (char *)dstv;
2935 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2936 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2937 unsigned long num_pages = num_extent_pages(eb->start, eb->len);
2938
2939 WARN_ON(start > eb->len);
2940 WARN_ON(start + len > eb->start + eb->len);
2941
2942 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2943
2944 while(len > 0) {
2945 page = extent_buffer_page(eb, i);
2946 if (!PageUptodate(page)) {
2947 printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
2948 WARN_ON(1);
2949 }
2950 WARN_ON(!PageUptodate(page));
2951
2952 cur = min(len, (PAGE_CACHE_SIZE - offset));
2953 kaddr = kmap_atomic(page, KM_USER1);
2954 memcpy(dst, kaddr + offset, cur);
2955 kunmap_atomic(kaddr, KM_USER1);
2956
2957 dst += cur;
2958 len -= cur;
2959 offset = 0;
2960 i++;
2961 }
2962}
2963EXPORT_SYMBOL(read_extent_buffer);
2964
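/*
 * kmap part of an extent buffer.  The requested range must fit within one
 * backing page; -EINVAL is returned if it would cross a page boundary.  On
 * success the kmap token, a pointer to the requested offset, the buffer
 * offset the mapping starts at and the number of addressable bytes are
 * returned through the out parameters.
 */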
2965int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
2966 unsigned long min_len, char **token, char **map,
2967 unsigned long *map_start,
2968 unsigned long *map_len, int km)
2969{
2970 size_t offset = start & (PAGE_CACHE_SIZE - 1);
2971 char *kaddr;
2972 struct page *p;
2973 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2974 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2975 unsigned long end_i = (start_offset + start + min_len - 1) >>
2976 PAGE_CACHE_SHIFT;
2977
2978 if (i != end_i)
2979 return -EINVAL;
2980
2981 if (i == 0) {
2982 offset = start_offset;
2983 *map_start = 0;
2984 } else {
2985 offset = 0;
2986 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
2987 }
2988 if (start + min_len > eb->len) {
2989		printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
2990 WARN_ON(1);
2991 }
2992
2993 p = extent_buffer_page(eb, i);
2994 WARN_ON(!PageUptodate(p));
2995 kaddr = kmap_atomic(p, km);
2996 *token = kaddr;
2997 *map = kaddr + offset;
2998 *map_len = PAGE_CACHE_SIZE - offset;
2999 return 0;
3000}
3001EXPORT_SYMBOL(map_private_extent_buffer);
3002
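/*
 * like map_private_extent_buffer, but cooperates with the mapping cached in
 * the extent buffer itself: an existing map_token is unmapped first and, if
 * there was one, the new mapping is saved back into the buffer.
 */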
3003int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3004 unsigned long min_len,
3005 char **token, char **map,
3006 unsigned long *map_start,
3007 unsigned long *map_len, int km)
3008{
3009 int err;
3010 int save = 0;
3011 if (eb->map_token) {
3012 unmap_extent_buffer(eb, eb->map_token, km);
3013 eb->map_token = NULL;
3014 save = 1;
3015 }
3016 err = map_private_extent_buffer(eb, start, min_len, token, map,
3017 map_start, map_len, km);
3018 if (!err && save) {
3019 eb->map_token = *token;
3020 eb->kaddr = *map;
3021 eb->map_start = *map_start;
3022 eb->map_len = *map_len;
3023 }
3024 return err;
3025}
3026EXPORT_SYMBOL(map_extent_buffer);
3027
3028void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3029{
3030 kunmap_atomic(token, km);
3031}
3032EXPORT_SYMBOL(unmap_extent_buffer);
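
/*
 * A sketch (not taken from the callers in this tree) of how the mapping
 * helpers above might be used; eb, offset and the KM_USER0 slot here are
 * only illustrative:
 *
 *	char *token;
 *	char *kaddr;
 *	unsigned long map_start;
 *	unsigned long map_len;
 *	u64 val;
 *
 *	if (!map_extent_buffer(eb, offset, sizeof(val), &token, &kaddr,
 *			       &map_start, &map_len, KM_USER0)) {
 *		memcpy(&val, kaddr, sizeof(val));
 *		unmap_extent_buffer(eb, token, KM_USER0);
 *	}
 */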
3033
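/*
 * compare len bytes at offset 'start' of the extent buffer with ptrv,
 * page by page; returns 0 on a match, otherwise the memcmp result of the
 * first chunk that differs.
 */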
3034int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3035 unsigned long start,
3036 unsigned long len)
3037{
3038 size_t cur;
3039 size_t offset;
3040 struct page *page;
3041 char *kaddr;
3042 char *ptr = (char *)ptrv;
3043 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3044 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3045 int ret = 0;
3046
3047 WARN_ON(start > eb->len);
3048 WARN_ON(start + len > eb->start + eb->len);
3049
3050 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3051
3052 while(len > 0) {
3053 page = extent_buffer_page(eb, i);
3054 WARN_ON(!PageUptodate(page));
3055
3056 cur = min(len, (PAGE_CACHE_SIZE - offset));
3057
3058 kaddr = kmap_atomic(page, KM_USER0);
3059 ret = memcmp(ptr, kaddr + offset, cur);
3060 kunmap_atomic(kaddr, KM_USER0);
3061 if (ret)
3062 break;
3063
3064 ptr += cur;
3065 len -= cur;
3066 offset = 0;
3067 i++;
3068 }
3069 return ret;
3070}
3071EXPORT_SYMBOL(memcmp_extent_buffer);
3072
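/*
 * copy len bytes from srcv into the extent buffer at offset 'start',
 * crossing page boundaries as needed.
 */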
3073void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3074 unsigned long start, unsigned long len)
3075{
3076 size_t cur;
3077 size_t offset;
3078 struct page *page;
3079 char *kaddr;
3080 char *src = (char *)srcv;
3081 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3082 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3083
3084 WARN_ON(start > eb->len);
3085 WARN_ON(start + len > eb->start + eb->len);
3086
3087 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3088
3089 while(len > 0) {
3090 page = extent_buffer_page(eb, i);
3091 WARN_ON(!PageUptodate(page));
3092
3093 cur = min(len, PAGE_CACHE_SIZE - offset);
3094 kaddr = kmap_atomic(page, KM_USER1);
3095 memcpy(kaddr + offset, src, cur);
3096 kunmap_atomic(kaddr, KM_USER1);
3097
3098 src += cur;
3099 len -= cur;
3100 offset = 0;
3101 i++;
3102 }
3103}
3104EXPORT_SYMBOL(write_extent_buffer);
3105
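/*
 * fill len bytes of the extent buffer, starting at offset 'start', with the
 * byte c.
 */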
3106void memset_extent_buffer(struct extent_buffer *eb, char c,
3107 unsigned long start, unsigned long len)
3108{
3109 size_t cur;
3110 size_t offset;
3111 struct page *page;
3112 char *kaddr;
3113 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3114 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3115
3116 WARN_ON(start > eb->len);
3117 WARN_ON(start + len > eb->start + eb->len);
3118
3119 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3120
3121 while(len > 0) {
3122 page = extent_buffer_page(eb, i);
3123 WARN_ON(!PageUptodate(page));
3124
3125 cur = min(len, PAGE_CACHE_SIZE - offset);
3126 kaddr = kmap_atomic(page, KM_USER0);
3127 memset(kaddr + offset, c, cur);
3128 kunmap_atomic(kaddr, KM_USER0);
3129
3130 len -= cur;
3131 offset = 0;
3132 i++;
3133 }
3134}
3135EXPORT_SYMBOL(memset_extent_buffer);
3136
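/*
 * copy len bytes from src_offset in the src buffer to dst_offset in the dst
 * buffer; the two buffers are expected to be the same length.
 */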
3137void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3138 unsigned long dst_offset, unsigned long src_offset,
3139 unsigned long len)
3140{
3141 u64 dst_len = dst->len;
3142 size_t cur;
3143 size_t offset;
3144 struct page *page;
3145 char *kaddr;
3146 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3147 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3148
3149 WARN_ON(src->len != dst_len);
3150
3151 offset = (start_offset + dst_offset) &
3152 ((unsigned long)PAGE_CACHE_SIZE - 1);
3153
3154 while(len > 0) {
3155 page = extent_buffer_page(dst, i);
3156 WARN_ON(!PageUptodate(page));
3157
3158 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3159
3160 kaddr = kmap_atomic(page, KM_USER0);
3161 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3162 kunmap_atomic(kaddr, KM_USER0);
3163
3164 src_offset += cur;
3165 len -= cur;
3166 offset = 0;
3167 i++;
3168 }
3169}
3170EXPORT_SYMBOL(copy_extent_buffer);
3171
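/*
 * page level helpers for the extent buffer memcpy/memmove routines.
 * copy_pages does a plain forward memcpy between two (possibly identical)
 * pages; move_pages uses memmove within a single page and a backwards byte
 * copy between two pages, so the overlapping moves done by
 * memmove_extent_buffer stay safe.
 */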
3172static void move_pages(struct page *dst_page, struct page *src_page,
3173 unsigned long dst_off, unsigned long src_off,
3174 unsigned long len)
3175{
3176 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3177 if (dst_page == src_page) {
3178 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3179 } else {
3180 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3181 char *p = dst_kaddr + dst_off + len;
3182 char *s = src_kaddr + src_off + len;
3183
3184 while (len--)
3185 *--p = *--s;
3186
3187 kunmap_atomic(src_kaddr, KM_USER1);
3188 }
3189 kunmap_atomic(dst_kaddr, KM_USER0);
3190}
3191
3192static void copy_pages(struct page *dst_page, struct page *src_page,
3193 unsigned long dst_off, unsigned long src_off,
3194 unsigned long len)
3195{
3196 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3197 char *src_kaddr;
3198
3199 if (dst_page != src_page)
3200 src_kaddr = kmap_atomic(src_page, KM_USER1);
3201 else
3202 src_kaddr = dst_kaddr;
3203
3204 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3205 kunmap_atomic(dst_kaddr, KM_USER0);
3206 if (dst_page != src_page)
3207 kunmap_atomic(src_kaddr, KM_USER1);
3208}
3209
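/*
 * copy len bytes from src_offset to dst_offset within one extent buffer,
 * splitting the copy at page boundaries.  The copy runs forwards; callers
 * with potentially overlapping ranges should use memmove_extent_buffer.
 */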
3210void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3211 unsigned long src_offset, unsigned long len)
3212{
3213 size_t cur;
3214 size_t dst_off_in_page;
3215 size_t src_off_in_page;
3216 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3217 unsigned long dst_i;
3218 unsigned long src_i;
3219
3220 if (src_offset + len > dst->len) {
3221		printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
3222 src_offset, len, dst->len);
3223 BUG_ON(1);
3224 }
3225 if (dst_offset + len > dst->len) {
3226		printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
3227 dst_offset, len, dst->len);
3228 BUG_ON(1);
3229 }
3230
3231 while(len > 0) {
3232 dst_off_in_page = (start_offset + dst_offset) &
3233 ((unsigned long)PAGE_CACHE_SIZE - 1);
3234 src_off_in_page = (start_offset + src_offset) &
3235 ((unsigned long)PAGE_CACHE_SIZE - 1);
3236
3237 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3238 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3239
3240 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3241 src_off_in_page));
3242 cur = min_t(unsigned long, cur,
3243 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3244
3245 copy_pages(extent_buffer_page(dst, dst_i),
3246 extent_buffer_page(dst, src_i),
3247 dst_off_in_page, src_off_in_page, cur);
3248
3249 src_offset += cur;
3250 dst_offset += cur;
3251 len -= cur;
3252 }
3253}
3254EXPORT_SYMBOL(memcpy_extent_buffer);
3255
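/*
 * like memcpy_extent_buffer but safe for overlapping ranges: when the
 * destination sits above the source the copy is done backwards from the end
 * of the range.
 */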
3256void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3257 unsigned long src_offset, unsigned long len)
3258{
3259 size_t cur;
3260 size_t dst_off_in_page;
3261 size_t src_off_in_page;
3262 unsigned long dst_end = dst_offset + len - 1;
3263 unsigned long src_end = src_offset + len - 1;
3264 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3265 unsigned long dst_i;
3266 unsigned long src_i;
3267
3268 if (src_offset + len > dst->len) {
3269 printk("memmove bogus src_offset %lu move len %lu len %lu\n",
3270 src_offset, len, dst->len);
3271 BUG_ON(1);
3272 }
3273 if (dst_offset + len > dst->len) {
3274 printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
3275 dst_offset, len, dst->len);
3276 BUG_ON(1);
3277 }
3278 if (dst_offset < src_offset) {
3279 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3280 return;
3281 }
3282 while(len > 0) {
3283 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3284 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3285
3286 dst_off_in_page = (start_offset + dst_end) &
3287 ((unsigned long)PAGE_CACHE_SIZE - 1);
3288 src_off_in_page = (start_offset + src_end) &
3289 ((unsigned long)PAGE_CACHE_SIZE - 1);
3290
3291 cur = min_t(unsigned long, len, src_off_in_page + 1);
3292 cur = min(cur, dst_off_in_page + 1);
3293 move_pages(extent_buffer_page(dst, dst_i),
3294 extent_buffer_page(dst, src_i),
3295 dst_off_in_page - cur + 1,
3296 src_off_in_page - cur + 1, cur);
3297
3298 dst_end -= cur;
3299 src_end -= cur;
3300 len -= cur;
3301 }
3302}
3303EXPORT_SYMBOL(memmove_extent_buffer);