| /* |
| * fs/ext4/extents_status.c |
| * |
| * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> |
| * Modified by |
| * Allison Henderson <achender@linux.vnet.ibm.com> |
| * Hugh Dickins <hughd@google.com> |
| * Zheng Liu <wenqing.lz@taobao.com> |
| * |
| * Ext4 extents status tree core functions. |
| */ |
| #include <linux/rbtree.h> |
| #include "ext4.h" |
| #include "extents_status.h" |
| #include "ext4_extents.h" |
| |
| #include <trace/events/ext4.h> |
| |
| /* |
| * According to previous discussion in Ext4 Developer Workshop, we |
| * will introduce a new structure called io tree to track all extent |
| * status in order to solve some problems that we have met |
| * (e.g. Reservation space warning), and provide extent-level locking. |
| * Delay extent tree is the first step to achieve this goal. It is |
| * original built by Yongqiang Yang. At that time it is called delay |
| * extent tree, whose goal is only track delay extent in memory to |
| * simplify the implementation of fiemap and bigalloc, and introduce |
| * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called |
| * delay extent tree at the following comment. But for better |
| * understand what it does, it has been rename to extent status tree. |
| * |
| * Currently the first step has been done. All delay extents are |
| * tracked in the tree. It maintains the delay extent when a delay |
| * allocation is issued, and the delay extent is written out or |
| * invalidated. Therefore the implementation of fiemap and bigalloc |
| * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. |
| * |
| * The following comment describes the implemenmtation of extent |
| * status tree and future works. |
| */ |
| |
| /* |
| * extents status tree implementation for ext4. |
| * |
| * |
| * ========================================================================== |
| * Extents status encompass delayed extents and extent locks |
| * |
| * 1. Why delayed extent implementation ? |
| * |
| * Without delayed extent, ext4 identifies a delayed extent by looking |
| * up page cache, this has several deficiencies - complicated, buggy, |
| * and inefficient code. |
| * |
| * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need |
| * to know if a block or a range of blocks are belonged to a delayed |
| * extent. |
| * |
| * Let us have a look at how they do without delayed extents implementation. |
| * -- FIEMAP |
| * FIEMAP looks up page cache to identify delayed allocations from holes. |
| * |
| * -- SEEK_HOLE/DATA |
| * SEEK_HOLE/DATA has the same problem as FIEMAP. |
| * |
| * -- bigalloc |
| * bigalloc looks up page cache to figure out if a block is |
| * already under delayed allocation or not to determine whether |
| * quota reserving is needed for the cluster. |
| * |
| * -- punch hole |
| * punch hole looks up page cache to identify a delayed extent. |
| * |
| * -- writeout |
| * Writeout looks up whole page cache to see if a buffer is |
| * mapped, If there are not very many delayed buffers, then it is |
| * time comsuming. |
| * |
| * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, |
| * bigalloc and writeout can figure out if a block or a range of |
| * blocks is under delayed allocation(belonged to a delayed extent) or |
| * not by searching the delayed extent tree. |
| * |
| * |
| * ========================================================================== |
| * 2. ext4 delayed extents impelmentation |
| * |
| * -- delayed extent |
| * A delayed extent is a range of blocks which are contiguous |
| * logically and under delayed allocation. Unlike extent in |
| * ext4, delayed extent in ext4 is a in-memory struct, there is |
| * no corresponding on-disk data. There is no limit on length of |
| * delayed extent, so a delayed extent can contain as many blocks |
| * as they are contiguous logically. |
| * |
| * -- delayed extent tree |
| * Every inode has a delayed extent tree and all under delayed |
| * allocation blocks are added to the tree as delayed extents. |
| * Delayed extents in the tree are ordered by logical block no. |
| * |
| * -- operations on a delayed extent tree |
| * There are three operations on a delayed extent tree: find next |
| * delayed extent, adding a space(a range of blocks) and removing |
| * a space. |
| * |
| * -- race on a delayed extent tree |
| * Delayed extent tree is protected inode->i_es_lock. |
| * |
| * |
| * ========================================================================== |
| * 3. performance analysis |
| * -- overhead |
| * 1. There is a cache extent for write access, so if writes are |
| * not very random, adding space operaions are in O(1) time. |
| * |
| * -- gain |
| * 2. Code is much simpler, more readable, more maintainable and |
| * more efficient. |
| * |
| * |
| * ========================================================================== |
| * 4. TODO list |
| * -- Track all extent status |
| * |
| * -- Improve get block process |
| * |
| * -- Extent-level locking |
| */ |
| |
| static struct kmem_cache *ext4_es_cachep; |
| |
| int __init ext4_init_es(void) |
| { |
| ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); |
| if (ext4_es_cachep == NULL) |
| return -ENOMEM; |
| return 0; |
| } |
| |
| void ext4_exit_es(void) |
| { |
| if (ext4_es_cachep) |
| kmem_cache_destroy(ext4_es_cachep); |
| } |
| |
| void ext4_es_init_tree(struct ext4_es_tree *tree) |
| { |
| tree->root = RB_ROOT; |
| tree->cache_es = NULL; |
| } |
| |
| #ifdef ES_DEBUG__ |
| static void ext4_es_print_tree(struct inode *inode) |
| { |
| struct ext4_es_tree *tree; |
| struct rb_node *node; |
| |
| printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); |
| tree = &EXT4_I(inode)->i_es_tree; |
| node = rb_first(&tree->root); |
| while (node) { |
| struct extent_status *es; |
| es = rb_entry(node, struct extent_status, rb_node); |
| printk(KERN_DEBUG " [%u/%u)", es->start, es->len); |
| node = rb_next(node); |
| } |
| printk(KERN_DEBUG "\n"); |
| } |
| #else |
| #define ext4_es_print_tree(inode) |
| #endif |
| |
| static inline ext4_lblk_t extent_status_end(struct extent_status *es) |
| { |
| BUG_ON(es->start + es->len < es->start); |
| return es->start + es->len - 1; |
| } |
| |
| /* |
| * search through the tree for an delayed extent with a given offset. If |
| * it can't be found, try to find next extent. |
| */ |
| static struct extent_status *__es_tree_search(struct rb_root *root, |
| ext4_lblk_t offset) |
| { |
| struct rb_node *node = root->rb_node; |
| struct extent_status *es = NULL; |
| |
| while (node) { |
| es = rb_entry(node, struct extent_status, rb_node); |
| if (offset < es->start) |
| node = node->rb_left; |
| else if (offset > extent_status_end(es)) |
| node = node->rb_right; |
| else |
| return es; |
| } |
| |
| if (es && offset < es->start) |
| return es; |
| |
| if (es && offset > extent_status_end(es)) { |
| node = rb_next(&es->rb_node); |
| return node ? rb_entry(node, struct extent_status, rb_node) : |
| NULL; |
| } |
| |
| return NULL; |
| } |
| |
| /* |
| * ext4_es_find_extent: find the 1st delayed extent covering @es->start |
| * if it exists, otherwise, the next extent after @es->start. |
| * |
| * @inode: the inode which owns delayed extents |
| * @es: delayed extent that we found |
| * |
| * Returns the first block of the next extent after es, otherwise |
| * EXT_MAX_BLOCKS if no delay extent is found. |
| * Delayed extent is returned via @es. |
| */ |
| ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) |
| { |
| struct ext4_es_tree *tree = NULL; |
| struct extent_status *es1 = NULL; |
| struct rb_node *node; |
| ext4_lblk_t ret = EXT_MAX_BLOCKS; |
| |
| trace_ext4_es_find_extent_enter(inode, es->start); |
| |
| read_lock(&EXT4_I(inode)->i_es_lock); |
| tree = &EXT4_I(inode)->i_es_tree; |
| |
| /* find delay extent in cache firstly */ |
| if (tree->cache_es) { |
| es1 = tree->cache_es; |
| if (in_range(es->start, es1->start, es1->len)) { |
| es_debug("%u cached by [%u/%u)\n", |
| es->start, es1->start, es1->len); |
| goto out; |
| } |
| } |
| |
| es->len = 0; |
| es1 = __es_tree_search(&tree->root, es->start); |
| |
| out: |
| if (es1) { |
| tree->cache_es = es1; |
| es->start = es1->start; |
| es->len = es1->len; |
| node = rb_next(&es1->rb_node); |
| if (node) { |
| es1 = rb_entry(node, struct extent_status, rb_node); |
| ret = es1->start; |
| } |
| } |
| |
| read_unlock(&EXT4_I(inode)->i_es_lock); |
| |
| trace_ext4_es_find_extent_exit(inode, es, ret); |
| return ret; |
| } |
| |
| static struct extent_status * |
| ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) |
| { |
| struct extent_status *es; |
| es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); |
| if (es == NULL) |
| return NULL; |
| es->start = start; |
| es->len = len; |
| return es; |
| } |
| |
| static void ext4_es_free_extent(struct extent_status *es) |
| { |
| kmem_cache_free(ext4_es_cachep, es); |
| } |
| |
| static struct extent_status * |
| ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) |
| { |
| struct extent_status *es1; |
| struct rb_node *node; |
| |
| node = rb_prev(&es->rb_node); |
| if (!node) |
| return es; |
| |
| es1 = rb_entry(node, struct extent_status, rb_node); |
| if (es->start == extent_status_end(es1) + 1) { |
| es1->len += es->len; |
| rb_erase(&es->rb_node, &tree->root); |
| ext4_es_free_extent(es); |
| es = es1; |
| } |
| |
| return es; |
| } |
| |
| static struct extent_status * |
| ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) |
| { |
| struct extent_status *es1; |
| struct rb_node *node; |
| |
| node = rb_next(&es->rb_node); |
| if (!node) |
| return es; |
| |
| es1 = rb_entry(node, struct extent_status, rb_node); |
| if (es1->start == extent_status_end(es) + 1) { |
| es->len += es1->len; |
| rb_erase(node, &tree->root); |
| ext4_es_free_extent(es1); |
| } |
| |
| return es; |
| } |
| |
| static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, |
| ext4_lblk_t len) |
| { |
| struct rb_node **p = &tree->root.rb_node; |
| struct rb_node *parent = NULL; |
| struct extent_status *es; |
| ext4_lblk_t end = offset + len - 1; |
| |
| BUG_ON(end < offset); |
| es = tree->cache_es; |
| if (es && offset == (extent_status_end(es) + 1)) { |
| es_debug("cached by [%u/%u)\n", es->start, es->len); |
| es->len += len; |
| es = ext4_es_try_to_merge_right(tree, es); |
| goto out; |
| } else if (es && es->start == end + 1) { |
| es_debug("cached by [%u/%u)\n", es->start, es->len); |
| es->start = offset; |
| es->len += len; |
| es = ext4_es_try_to_merge_left(tree, es); |
| goto out; |
| } else if (es && es->start <= offset && |
| end <= extent_status_end(es)) { |
| es_debug("cached by [%u/%u)\n", es->start, es->len); |
| goto out; |
| } |
| |
| while (*p) { |
| parent = *p; |
| es = rb_entry(parent, struct extent_status, rb_node); |
| |
| if (offset < es->start) { |
| if (es->start == end + 1) { |
| es->start = offset; |
| es->len += len; |
| es = ext4_es_try_to_merge_left(tree, es); |
| goto out; |
| } |
| p = &(*p)->rb_left; |
| } else if (offset > extent_status_end(es)) { |
| if (offset == extent_status_end(es) + 1) { |
| es->len += len; |
| es = ext4_es_try_to_merge_right(tree, es); |
| goto out; |
| } |
| p = &(*p)->rb_right; |
| } else { |
| if (extent_status_end(es) <= end) |
| es->len = offset - es->start + len; |
| goto out; |
| } |
| } |
| |
| es = ext4_es_alloc_extent(offset, len); |
| if (!es) |
| return -ENOMEM; |
| rb_link_node(&es->rb_node, parent, p); |
| rb_insert_color(&es->rb_node, &tree->root); |
| |
| out: |
| tree->cache_es = es; |
| return 0; |
| } |
| |
| /* |
| * ext4_es_insert_extent() adds a space to a delayed extent tree. |
| * Caller holds inode->i_es_lock. |
| * |
| * ext4_es_insert_extent is called by ext4_da_write_begin and |
| * ext4_es_remove_extent. |
| * |
| * Return 0 on success, error code on failure. |
| */ |
| int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, |
| ext4_lblk_t len) |
| { |
| struct ext4_es_tree *tree; |
| int err = 0; |
| |
| trace_ext4_es_insert_extent(inode, offset, len); |
| es_debug("add [%u/%u) to extent status tree of inode %lu\n", |
| offset, len, inode->i_ino); |
| |
| write_lock(&EXT4_I(inode)->i_es_lock); |
| tree = &EXT4_I(inode)->i_es_tree; |
| err = __es_insert_extent(tree, offset, len); |
| write_unlock(&EXT4_I(inode)->i_es_lock); |
| |
| ext4_es_print_tree(inode); |
| |
| return err; |
| } |
| |
| /* |
| * ext4_es_remove_extent() removes a space from a delayed extent tree. |
| * Caller holds inode->i_es_lock. |
| * |
| * Return 0 on success, error code on failure. |
| */ |
| int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, |
| ext4_lblk_t len) |
| { |
| struct rb_node *node; |
| struct ext4_es_tree *tree; |
| struct extent_status *es; |
| struct extent_status orig_es; |
| ext4_lblk_t len1, len2, end; |
| int err = 0; |
| |
| trace_ext4_es_remove_extent(inode, offset, len); |
| es_debug("remove [%u/%u) from extent status tree of inode %lu\n", |
| offset, len, inode->i_ino); |
| |
| end = offset + len - 1; |
| BUG_ON(end < offset); |
| write_lock(&EXT4_I(inode)->i_es_lock); |
| tree = &EXT4_I(inode)->i_es_tree; |
| es = __es_tree_search(&tree->root, offset); |
| if (!es) |
| goto out; |
| if (es->start > end) |
| goto out; |
| |
| /* Simply invalidate cache_es. */ |
| tree->cache_es = NULL; |
| |
| orig_es.start = es->start; |
| orig_es.len = es->len; |
| len1 = offset > es->start ? offset - es->start : 0; |
| len2 = extent_status_end(es) > end ? |
| extent_status_end(es) - end : 0; |
| if (len1 > 0) |
| es->len = len1; |
| if (len2 > 0) { |
| if (len1 > 0) { |
| err = __es_insert_extent(tree, end + 1, len2); |
| if (err) { |
| es->start = orig_es.start; |
| es->len = orig_es.len; |
| goto out; |
| } |
| } else { |
| es->start = end + 1; |
| es->len = len2; |
| } |
| goto out; |
| } |
| |
| if (len1 > 0) { |
| node = rb_next(&es->rb_node); |
| if (node) |
| es = rb_entry(node, struct extent_status, rb_node); |
| else |
| es = NULL; |
| } |
| |
| while (es && extent_status_end(es) <= end) { |
| node = rb_next(&es->rb_node); |
| rb_erase(&es->rb_node, &tree->root); |
| ext4_es_free_extent(es); |
| if (!node) { |
| es = NULL; |
| break; |
| } |
| es = rb_entry(node, struct extent_status, rb_node); |
| } |
| |
| if (es && es->start < end + 1) { |
| len1 = extent_status_end(es) - end; |
| es->start = end + 1; |
| es->len = len1; |
| } |
| |
| out: |
| write_unlock(&EXT4_I(inode)->i_es_lock); |
| ext4_es_print_tree(inode); |
| return err; |
| } |