Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6: (52 commits)
split invalidate_inodes()
fs: skip I_FREEING inodes in writeback_sb_inodes
fs: fold invalidate_list into invalidate_inodes
fs: do not drop inode_lock in dispose_list
fs: inode split IO and LRU lists
fs: switch bdev inode bdi's correctly
fs: fix buffer invalidation in invalidate_list
fsnotify: use dget_parent
smbfs: use dget_parent
exportfs: use dget_parent
fs: use RCU read side protection in d_validate
fs: clean up dentry lru modification
fs: split __shrink_dcache_sb
fs: improve DCACHE_REFERENCED usage
fs: use percpu counter for nr_dentry and nr_dentry_unused
fs: simplify __d_free
fs: take dcache_lock inside __d_path
fs: do not assign default i_ino in new_inode
fs: introduce a per-cpu last_ino allocator
new helper: ihold()
...
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 2db4283..8a817f6 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -349,21 +349,36 @@
--------------------------- block_device_operations -----------------------
prototypes:
- int (*open) (struct inode *, struct file *);
- int (*release) (struct inode *, struct file *);
- int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+ int (*open) (struct block_device *, fmode_t);
+ int (*release) (struct gendisk *, fmode_t);
+ int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+ int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
+ int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
int (*media_changed) (struct gendisk *);
+ void (*unlock_native_capacity) (struct gendisk *);
int (*revalidate_disk) (struct gendisk *);
+ int (*getgeo)(struct block_device *, struct hd_geometry *);
+ void (*swap_slot_free_notify) (struct block_device *, unsigned long);
locking rules:
- BKL bd_sem
-open: yes yes
-release: yes yes
-ioctl: yes no
+ BKL bd_mutex
+open: no yes
+release: no yes
+ioctl: no no
+compat_ioctl: no no
+direct_access: no no
media_changed: no no
+unlock_native_capacity: no no
revalidate_disk: no no
+getgeo: no no
+swap_slot_free_notify: no no (see below)
-The last two are called only from check_disk_change().
+media_changed, unlock_native_capacity and revalidate_disk are called only from
+check_disk_change().
+
+swap_slot_free_notify is called with swap_lock and sometimes the page lock
+held.
+
--------------------------- file_operations -------------------------------
prototypes:
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index fc0e39a..4ede421 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -62,10 +62,10 @@
# mount /dev/sd0 /tmp/a
#ls /tmp/a
- t1 t2 t2
+ t1 t2 t3
#ls /mnt/a
- t1 t2 t2
+ t1 t2 t3
Note that the mount has propagated to the mount at /mnt as well.
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index e985b1c..1256454 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -876,6 +876,10 @@
if (dev->dev_info)
filp->f_mapping->backing_dev_info = dev->dev_info;
+ /* Is /dev/mem or /dev/kmem ? */
+ if (dev->dev_info == &directly_mappable_cdev_bdi)
+ filp->f_mode |= FMODE_UNSIGNED_OFFSET;
+
if (dev->fops->open)
return dev->fops->open(inode, filp);
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index d13e726..12d5bf7 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -57,6 +57,7 @@
goto bail;
}
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_private = data;
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index a0e6613..7e433d7 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -58,6 +58,7 @@
goto bail;
}
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = 0;
inode->i_gid = 0;
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index af2497a..0a53500 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -146,6 +146,7 @@
struct inode *ret = new_inode(sb);
if (ret) {
+ ret->i_ino = get_next_ino();
ret->i_mode = mode;
ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
}
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index 95f711b..449de59 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -28,6 +28,7 @@
struct inode *inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 97dae29..c62d300 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -882,12 +882,8 @@
static int pohmelfs_fsync(struct file *file, int datasync)
{
struct inode *inode = file->f_mapping->host;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- return sync_inode(inode, &wbc);
+ return sync_inode_metadata(inode, 1);
}
ssize_t pohmelfs_write(struct file *file, const char __user *buf,
diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 095fa53..e2f63c0 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -276,6 +276,7 @@
struct inode *inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c
index e4f5950..e093fd8 100644
--- a/drivers/usb/gadget/f_fs.c
+++ b/drivers/usb/gadget/f_fs.c
@@ -980,6 +980,7 @@
if (likely(inode)) {
struct timespec current_time = CURRENT_TIME;
+ inode->i_ino = usbfs_get_inode();
inode->i_mode = perms->mode;
inode->i_uid = perms->uid;
inode->i_gid = perms->gid;
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index d1d72d9..ba145e7 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -1991,6 +1991,7 @@
struct inode *inode = new_inode (sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = default_uid;
inode->i_gid = default_gid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 9e670d5..ef5905f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1789,9 +1789,10 @@
kfree(st);
} else {
/* Caching disabled. No need to get upto date stat info.
- * This dentry will be released immediately. So, just i_count++
+ * This dentry will be released immediately. So, just hold the
+ * inode
*/
- atomic_inc(&old_dentry->d_inode->i_count);
+ ihold(old_dentry->d_inode);
}
dentry->d_op = old_dentry->d_op;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index c4a9875..0a90dcd 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -894,9 +894,9 @@
if (AFFS_SB(sb)->s_flags & SF_OFS) {
struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0);
u32 tmp;
- if (IS_ERR(ext_bh)) {
+ if (IS_ERR(bh)) {
affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)",
- ext, PTR_ERR(ext_bh));
+ ext, PTR_ERR(bh));
return;
}
tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 3a0fdec..5d82890 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -388,7 +388,7 @@
affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
mark_buffer_dirty_inode(inode_bh, inode);
inode->i_nlink = 2;
- atomic_inc(&inode->i_count);
+ ihold(inode);
}
affs_fix_checksum(sb, bh);
mark_buffer_dirty_inode(bh, inode);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 0d38c09..5439e1b 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1045,7 +1045,7 @@
if (ret < 0)
goto link_error;
- atomic_inc(&vnode->vfs_inode.i_count);
+ ihold(&vnode->vfs_inode);
d_instantiate(dentry, &vnode->vfs_inode);
key_put(key);
_leave(" = 0");
diff --git a/fs/aio.c b/fs/aio.c
index 250b0a7..8c8f6c5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1543,7 +1543,19 @@
}
abe = mempool_alloc(abe_pool, GFP_KERNEL);
- BUG_ON(!igrab(mapping->host));
+
+ /*
+ * we should be using igrab here, but
+ * we don't want to hammer on the global
+ * inode spinlock just to take an extra
+ * reference on a file that we must already
+ * have a reference to.
+ *
+ * When we're called, we always have a reference
+ * on the file, so we must always have a reference
+ * on the inode, so ihold() is safe here.
+ */
+ ihold(mapping->host);
abe->mapping = mapping;
hlist_add_head(&abe->list, &batch_hash[bucket]);
return;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6..5365527 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -111,10 +111,9 @@
path.mnt = mntget(anon_inode_mnt);
/*
* We know the anon_inode inode count is always greater than zero,
- * so we can avoid doing an igrab() and we can use an open-coded
- * atomic_inc().
+ * so ihold() is safe.
*/
- atomic_inc(&anon_inode_inode->i_count);
+ ihold(anon_inode_inode);
path.dentry->d_op = &anon_inodefs_dentry_operations;
d_instantiate(path.dentry, anon_inode_inode);
@@ -194,6 +193,7 @@
if (!inode)
return ERR_PTR(-ENOMEM);
+ inode->i_ino = get_next_ino();
inode->i_fop = &anon_inode_fops;
inode->i_mapping->a_ops = &anon_aops;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 821b2b9..ac87e49 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -398,6 +398,7 @@
inode->i_gid = sb->s_root->d_inode->i_gid;
}
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_ino = get_next_ino();
if (S_ISDIR(inf->mode)) {
inode->i_nlink = 2;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index d967e05..685ecff 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -176,7 +176,7 @@
inc_nlink(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
d_instantiate(new, inode);
mutex_unlock(&info->bfs_lock);
return 0;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 139fc80..29990f0 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -495,6 +495,7 @@
struct inode * inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime =
current_fs_time(inode->i_sb);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b737451..dea3b62 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -48,6 +48,21 @@
EXPORT_SYMBOL(I_BDEV);
+/*
+ * move the inode from it's current bdi to the a new bdi. if the inode is dirty
+ * we need to move it onto the dirty list of @dst so that the inode is always
+ * on the right list.
+ */
+static void bdev_inode_switch_bdi(struct inode *inode,
+ struct backing_dev_info *dst)
+{
+ spin_lock(&inode_lock);
+ inode->i_data.backing_dev_info = dst;
+ if (inode->i_state & I_DIRTY)
+ list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+ spin_unlock(&inode_lock);
+}
+
static sector_t max_block(struct block_device *bdev)
{
sector_t retval = ~((sector_t)0);
@@ -550,7 +565,7 @@
*/
struct block_device *bdgrab(struct block_device *bdev)
{
- atomic_inc(&bdev->bd_inode->i_count);
+ ihold(bdev->bd_inode);
return bdev;
}
@@ -580,7 +595,7 @@
spin_lock(&bdev_lock);
bdev = inode->i_bdev;
if (bdev) {
- atomic_inc(&bdev->bd_inode->i_count);
+ ihold(bdev->bd_inode);
spin_unlock(&bdev_lock);
return bdev;
}
@@ -591,12 +606,12 @@
spin_lock(&bdev_lock);
if (!inode->i_bdev) {
/*
- * We take an additional bd_inode->i_count for inode,
+ * We take an additional reference to bd_inode,
* and it's released in clear_inode() of inode.
* So, we can access it via ->i_mapping always
* without igrab().
*/
- atomic_inc(&bdev->bd_inode->i_count);
+ ihold(bdev->bd_inode);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -1390,7 +1405,7 @@
bdi = blk_get_backing_dev_info(bdev);
if (bdi == NULL)
bdi = &default_backing_dev_info;
- bdev->bd_inode->i_data.backing_dev_info = bdi;
+ bdev_inode_switch_bdi(bdev->bd_inode, bdi);
}
if (bdev->bd_invalidated)
rescan_partitions(disk, bdev);
@@ -1405,8 +1420,8 @@
if (ret)
goto out_clear;
bdev->bd_contains = whole;
- bdev->bd_inode->i_data.backing_dev_info =
- whole->bd_inode->i_data.backing_dev_info;
+ bdev_inode_switch_bdi(bdev->bd_inode,
+ whole->bd_inode->i_data.backing_dev_info);
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1439,7 +1454,7 @@
disk_put_part(bdev->bd_part);
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
- bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+ bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
@@ -1533,7 +1548,8 @@
disk_put_part(bdev->bd_part);
bdev->bd_part = NULL;
bdev->bd_disk = NULL;
- bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
+ bdev_inode_switch_bdi(bdev->bd_inode,
+ &default_backing_dev_info);
if (bdev != bdev->bd_contains)
victim = bdev->bd_contains;
bdev->bd_contains = NULL;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c038644..64f99cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3849,7 +3849,7 @@
p = &root->inode_tree.rb_node;
parent = NULL;
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
return;
spin_lock(&root->inode_lock);
@@ -4758,7 +4758,7 @@
}
btrfs_set_trans_block_group(trans, dir);
- atomic_inc(&inode->i_count);
+ ihold(inode);
err = btrfs_add_nondir(trans, dentry, inode, 1, index);
diff --git a/fs/buffer.c b/fs/buffer.c
index 8d595ab..5930e38 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1833,9 +1833,11 @@
}
EXPORT_SYMBOL(page_zero_new_buffers);
-int block_prepare_write(struct page *page, unsigned from, unsigned to,
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
sector_t block;
@@ -1915,7 +1917,7 @@
}
return err;
}
-EXPORT_SYMBOL(block_prepare_write);
+EXPORT_SYMBOL(__block_write_begin);
static int __block_commit_write(struct inode *inode, struct page *page,
unsigned from, unsigned to)
@@ -1952,15 +1954,6 @@
return 0;
}
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
- get_block_t *get_block)
-{
- unsigned start = pos & (PAGE_CACHE_SIZE - 1);
-
- return block_prepare_write(page, start, start + len, get_block);
-}
-EXPORT_SYMBOL(__block_write_begin);
-
/*
* block_write_begin takes care of the basic task of block allocation and
* bringing partial write blocks uptodate first.
@@ -2378,7 +2371,7 @@
else
end = PAGE_CACHE_SIZE;
- ret = block_prepare_write(page, 0, end, get_block);
+ ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
ret = block_commit_write(page, 0, end);
@@ -2465,11 +2458,10 @@
*fsdata = NULL;
if (page_has_buffers(page)) {
- unlock_page(page);
- page_cache_release(page);
- *pagep = NULL;
- return block_write_begin(mapping, pos, len, flags, pagep,
- get_block);
+ ret = __block_write_begin(page, pos, len, get_block);
+ if (unlikely(ret))
+ goto out_release;
+ return ret;
}
if (PageMappedToDisk(page))
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 96fbeab..5d8b355 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -276,7 +276,7 @@
}
coda_dir_update_mtime(dir_inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
d_instantiate(de, inode);
inc_nlink(inode);
return 0;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index cf78d44..253476d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -135,6 +135,7 @@
{
struct inode * inode = new_inode(configfs_sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mapping->a_ops = &configfs_aops;
inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
inode->i_op = &configfs_inode_operations;
diff --git a/fs/dcache.c b/fs/dcache.c
index 83293be..23702a9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -67,33 +67,43 @@
.age_limit = 45,
};
-static void __d_free(struct dentry *dentry)
+static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
+static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
+ dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
+ dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+ return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+
+static void __d_free(struct rcu_head *head)
+{
+ struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+
WARN_ON(!list_empty(&dentry->d_alias));
if (dname_external(dentry))
kfree(dentry->d_name.name);
kmem_cache_free(dentry_cache, dentry);
}
-static void d_callback(struct rcu_head *head)
-{
- struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu);
- __d_free(dentry);
-}
-
/*
- * no dcache_lock, please. The caller must decrement dentry_stat.nr_dentry
- * inside dcache_lock.
+ * no dcache_lock, please.
*/
static void d_free(struct dentry *dentry)
{
+ percpu_counter_dec(&nr_dentry);
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
+
/* if dentry was never inserted into hash, immediate free is OK */
if (hlist_unhashed(&dentry->d_hash))
- __d_free(dentry);
+ __d_free(&dentry->d_u.d_rcu);
else
- call_rcu(&dentry->d_u.d_rcu, d_callback);
+ call_rcu(&dentry->d_u.d_rcu, __d_free);
}
/*
@@ -123,37 +133,34 @@
}
/*
- * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ * dentry_lru_(add|del|move_tail) must be called with dcache_lock held.
*/
static void dentry_lru_add(struct dentry *dentry)
{
- list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
- dentry->d_sb->s_nr_dentry_unused++;
- dentry_stat.nr_unused++;
-}
-
-static void dentry_lru_add_tail(struct dentry *dentry)
-{
- list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
- dentry->d_sb->s_nr_dentry_unused++;
- dentry_stat.nr_unused++;
+ if (list_empty(&dentry->d_lru)) {
+ list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+ dentry->d_sb->s_nr_dentry_unused++;
+ percpu_counter_inc(&nr_dentry_unused);
+ }
}
static void dentry_lru_del(struct dentry *dentry)
{
if (!list_empty(&dentry->d_lru)) {
- list_del(&dentry->d_lru);
+ list_del_init(&dentry->d_lru);
dentry->d_sb->s_nr_dentry_unused--;
- dentry_stat.nr_unused--;
+ percpu_counter_dec(&nr_dentry_unused);
}
}
-static void dentry_lru_del_init(struct dentry *dentry)
+static void dentry_lru_move_tail(struct dentry *dentry)
{
- if (likely(!list_empty(&dentry->d_lru))) {
- list_del_init(&dentry->d_lru);
- dentry->d_sb->s_nr_dentry_unused--;
- dentry_stat.nr_unused--;
+ if (list_empty(&dentry->d_lru)) {
+ list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+ dentry->d_sb->s_nr_dentry_unused++;
+ percpu_counter_inc(&nr_dentry_unused);
+ } else {
+ list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
}
}
@@ -172,7 +179,6 @@
struct dentry *parent;
list_del(&dentry->d_u.d_child);
- dentry_stat.nr_dentry--; /* For d_free, below */
/*drops the locks, at that point nobody can reach this dentry */
dentry_iput(dentry);
if (IS_ROOT(dentry))
@@ -237,13 +243,15 @@
if (dentry->d_op->d_delete(dentry))
goto unhash_it;
}
+
/* Unreachable? Get rid of it */
if (d_unhashed(dentry))
goto kill_it;
- if (list_empty(&dentry->d_lru)) {
- dentry->d_flags |= DCACHE_REFERENCED;
- dentry_lru_add(dentry);
- }
+
+ /* Otherwise leave it cached and ensure it's on the LRU */
+ dentry->d_flags |= DCACHE_REFERENCED;
+ dentry_lru_add(dentry);
+
spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
return;
@@ -318,11 +326,10 @@
EXPORT_SYMBOL(d_invalidate);
/* This should be called _only_ with dcache_lock held */
-
static inline struct dentry * __dget_locked(struct dentry *dentry)
{
atomic_inc(&dentry->d_count);
- dentry_lru_del_init(dentry);
+ dentry_lru_del(dentry);
return dentry;
}
@@ -441,73 +448,27 @@
if (dentry->d_op && dentry->d_op->d_delete)
dentry->d_op->d_delete(dentry);
- dentry_lru_del_init(dentry);
+ dentry_lru_del(dentry);
__d_drop(dentry);
dentry = d_kill(dentry);
spin_lock(&dcache_lock);
}
}
-/*
- * Shrink the dentry LRU on a given superblock.
- * @sb : superblock to shrink dentry LRU.
- * @count: If count is NULL, we prune all dentries on superblock.
- * @flags: If flags is non-zero, we need to do special processing based on
- * which flags are set. This means we don't need to maintain multiple
- * similar copies of this loop.
- */
-static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+static void shrink_dentry_list(struct list_head *list)
{
- LIST_HEAD(referenced);
- LIST_HEAD(tmp);
struct dentry *dentry;
- int cnt = 0;
- BUG_ON(!sb);
- BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
- spin_lock(&dcache_lock);
- if (count != NULL)
- /* called from prune_dcache() and shrink_dcache_parent() */
- cnt = *count;
-restart:
- if (count == NULL)
- list_splice_init(&sb->s_dentry_lru, &tmp);
- else {
- while (!list_empty(&sb->s_dentry_lru)) {
- dentry = list_entry(sb->s_dentry_lru.prev,
- struct dentry, d_lru);
- BUG_ON(dentry->d_sb != sb);
+ while (!list_empty(list)) {
+ dentry = list_entry(list->prev, struct dentry, d_lru);
+ dentry_lru_del(dentry);
- spin_lock(&dentry->d_lock);
- /*
- * If we are honouring the DCACHE_REFERENCED flag and
- * the dentry has this flag set, don't free it. Clear
- * the flag and put it back on the LRU.
- */
- if ((flags & DCACHE_REFERENCED)
- && (dentry->d_flags & DCACHE_REFERENCED)) {
- dentry->d_flags &= ~DCACHE_REFERENCED;
- list_move(&dentry->d_lru, &referenced);
- spin_unlock(&dentry->d_lock);
- } else {
- list_move_tail(&dentry->d_lru, &tmp);
- spin_unlock(&dentry->d_lock);
- cnt--;
- if (!cnt)
- break;
- }
- cond_resched_lock(&dcache_lock);
- }
- }
- while (!list_empty(&tmp)) {
- dentry = list_entry(tmp.prev, struct dentry, d_lru);
- dentry_lru_del_init(dentry);
- spin_lock(&dentry->d_lock);
/*
* We found an inuse dentry which was not removed from
* the LRU because of laziness during lookup. Do not free
* it - just keep it off the LRU list.
*/
+ spin_lock(&dentry->d_lock);
if (atomic_read(&dentry->d_count)) {
spin_unlock(&dentry->d_lock);
continue;
@@ -516,13 +477,60 @@
/* dentry->d_lock was dropped in prune_one_dentry() */
cond_resched_lock(&dcache_lock);
}
- if (count == NULL && !list_empty(&sb->s_dentry_lru))
- goto restart;
- if (count != NULL)
- *count = cnt;
+}
+
+/**
+ * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
+ * @sb: superblock to shrink dentry LRU.
+ * @count: number of entries to prune
+ * @flags: flags to control the dentry processing
+ *
+ * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
+ */
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
+{
+ /* called from prune_dcache() and shrink_dcache_parent() */
+ struct dentry *dentry;
+ LIST_HEAD(referenced);
+ LIST_HEAD(tmp);
+ int cnt = *count;
+
+ spin_lock(&dcache_lock);
+ while (!list_empty(&sb->s_dentry_lru)) {
+ dentry = list_entry(sb->s_dentry_lru.prev,
+ struct dentry, d_lru);
+ BUG_ON(dentry->d_sb != sb);
+
+ /*
+ * If we are honouring the DCACHE_REFERENCED flag and the
+ * dentry has this flag set, don't free it. Clear the flag
+ * and put it back on the LRU.
+ */
+ if (flags & DCACHE_REFERENCED) {
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_REFERENCED) {
+ dentry->d_flags &= ~DCACHE_REFERENCED;
+ list_move(&dentry->d_lru, &referenced);
+ spin_unlock(&dentry->d_lock);
+ cond_resched_lock(&dcache_lock);
+ continue;
+ }
+ spin_unlock(&dentry->d_lock);
+ }
+
+ list_move_tail(&dentry->d_lru, &tmp);
+ if (!--cnt)
+ break;
+ cond_resched_lock(&dcache_lock);
+ }
+
+ *count = cnt;
+ shrink_dentry_list(&tmp);
+
if (!list_empty(&referenced))
list_splice(&referenced, &sb->s_dentry_lru);
spin_unlock(&dcache_lock);
+
}
/**
@@ -538,7 +546,7 @@
{
struct super_block *sb, *p = NULL;
int w_count;
- int unused = dentry_stat.nr_unused;
+ int unused = percpu_counter_sum_positive(&nr_dentry_unused);
int prune_ratio;
int pruned;
@@ -608,13 +616,19 @@
* shrink_dcache_sb - shrink dcache for a superblock
* @sb: superblock
*
- * Shrink the dcache for the specified super block. This
- * is used to free the dcache before unmounting a file
- * system
+ * Shrink the dcache for the specified super block. This is used to free
+ * the dcache before unmounting a file system.
*/
-void shrink_dcache_sb(struct super_block * sb)
+void shrink_dcache_sb(struct super_block *sb)
{
- __shrink_dcache_sb(sb, NULL, 0);
+ LIST_HEAD(tmp);
+
+ spin_lock(&dcache_lock);
+ while (!list_empty(&sb->s_dentry_lru)) {
+ list_splice_init(&sb->s_dentry_lru, &tmp);
+ shrink_dentry_list(&tmp);
+ }
+ spin_unlock(&dcache_lock);
}
EXPORT_SYMBOL(shrink_dcache_sb);
@@ -632,7 +646,7 @@
/* detach this root from the system */
spin_lock(&dcache_lock);
- dentry_lru_del_init(dentry);
+ dentry_lru_del(dentry);
__d_drop(dentry);
spin_unlock(&dcache_lock);
@@ -646,7 +660,7 @@
spin_lock(&dcache_lock);
list_for_each_entry(loop, &dentry->d_subdirs,
d_u.d_child) {
- dentry_lru_del_init(loop);
+ dentry_lru_del(loop);
__d_drop(loop);
cond_resched_lock(&dcache_lock);
}
@@ -703,20 +717,13 @@
* otherwise we ascend to the parent and move to the
* next sibling if there is one */
if (!parent)
- goto out;
-
+ return;
dentry = parent;
-
} while (list_empty(&dentry->d_subdirs));
dentry = list_entry(dentry->d_subdirs.next,
struct dentry, d_u.d_child);
}
-out:
- /* several dentries were freed, need to correct nr_dentry */
- spin_lock(&dcache_lock);
- dentry_stat.nr_dentry -= detached;
- spin_unlock(&dcache_lock);
}
/*
@@ -830,14 +837,15 @@
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next;
- dentry_lru_del_init(dentry);
/*
* move only zero ref count dentries to the end
* of the unused list for prune_dcache
*/
if (!atomic_read(&dentry->d_count)) {
- dentry_lru_add_tail(dentry);
+ dentry_lru_move_tail(dentry);
found++;
+ } else {
+ dentry_lru_del(dentry);
}
/*
@@ -900,12 +908,16 @@
*/
static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
{
+ int nr_unused;
+
if (nr) {
if (!(gfp_mask & __GFP_FS))
return -1;
prune_dcache(nr);
}
- return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+
+ nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+ return (nr_unused / 100) * sysctl_vfs_cache_pressure;
}
static struct shrinker dcache_shrinker = {
@@ -972,9 +984,10 @@
spin_lock(&dcache_lock);
if (parent)
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
- dentry_stat.nr_dentry++;
spin_unlock(&dcache_lock);
+ percpu_counter_inc(&nr_dentry);
+
return dentry;
}
EXPORT_SYMBOL(d_alloc);
@@ -1478,33 +1491,26 @@
* This is used by ncpfs in its readdir implementation.
* Zero is returned in the dentry is invalid.
*/
-
-int d_validate(struct dentry *dentry, struct dentry *dparent)
+int d_validate(struct dentry *dentry, struct dentry *parent)
{
- struct hlist_head *base;
- struct hlist_node *lhp;
+ struct hlist_head *head = d_hash(parent, dentry->d_name.hash);
+ struct hlist_node *node;
+ struct dentry *d;
/* Check whether the ptr might be valid at all.. */
if (!kmem_ptr_validate(dentry_cache, dentry))
- goto out;
+ return 0;
+ if (dentry->d_parent != parent)
+ return 0;
- if (dentry->d_parent != dparent)
- goto out;
-
- spin_lock(&dcache_lock);
- base = d_hash(dparent, dentry->d_name.hash);
- hlist_for_each(lhp,base) {
- /* hlist_for_each_entry_rcu() not required for d_hash list
- * as it is parsed under dcache_lock
- */
- if (dentry == hlist_entry(lhp, struct dentry, d_hash)) {
- __dget_locked(dentry);
- spin_unlock(&dcache_lock);
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(d, node, head, d_hash) {
+ if (d == dentry) {
+ dget(dentry);
return 1;
}
}
- spin_unlock(&dcache_lock);
-out:
+ rcu_read_unlock();
return 0;
}
EXPORT_SYMBOL(d_validate);
@@ -1994,7 +2000,7 @@
* Returns a pointer into the buffer or an error code if the
* path was too long.
*
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * "buflen" should be positive.
*
* If path is not reachable from the supplied root, then the value of
* root is changed (without modifying refcounts).
@@ -2006,10 +2012,12 @@
int error;
prepend(&res, &buflen, "\0", 1);
+ spin_lock(&dcache_lock);
error = prepend_path(path, root, &res, &buflen);
+ spin_unlock(&dcache_lock);
+
if (error)
return ERR_PTR(error);
-
return res;
}
@@ -2419,6 +2427,9 @@
{
int loop;
+ percpu_counter_init(&nr_dentry, 0);
+ percpu_counter_init(&nr_dentry_unused, 0);
+
/*
* A constructor could be added for stable state like the lists,
* but it is probably not worth it because of the cache nature
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 30a87b3..a4ed838 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,6 +40,7 @@
struct inode *inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 68cb23e..b905c79 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -46,10 +46,6 @@
{
int ret;
struct inode *inode = filp->f_mapping->host;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* metadata-only; caller takes care of data */
- };
struct super_block *sb;
if (!(inode->i_state & I_DIRTY))
@@ -57,7 +53,7 @@
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
return 0;
- ret = sync_inode(inode, &wbc);
+ ret = sync_inode_metadata(inode, 1);
/* This is a good place to write the sb */
/* TODO: Sechedule an sb-sync on create */
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index b7dd0c2..264e95d 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -153,7 +153,7 @@
inode->i_ctime = CURRENT_TIME;
inode_inc_link_count(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
return exofs_add_nondir(dentry, inode);
}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index e9e1759..51b3040 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -74,21 +74,20 @@
find_disconnected_root(struct dentry *dentry)
{
dget(dentry);
- spin_lock(&dentry->d_lock);
- while (!IS_ROOT(dentry) &&
- (dentry->d_parent->d_flags & DCACHE_DISCONNECTED)) {
- struct dentry *parent = dentry->d_parent;
- dget(parent);
- spin_unlock(&dentry->d_lock);
+ while (!IS_ROOT(dentry)) {
+ struct dentry *parent = dget_parent(dentry);
+
+ if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
+ dput(parent);
+ break;
+ }
+
dput(dentry);
dentry = parent;
- spin_lock(&dentry->d_lock);
}
- spin_unlock(&dentry->d_lock);
return dentry;
}
-
/*
* Make sure target_dir is fully connected to the dentry tree.
*
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 7641098..2709b34 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -98,7 +98,7 @@
if (IS_DIRSYNC(dir)) {
err = write_one_page(page, 1);
if (!err)
- err = ext2_sync_inode(dir);
+ err = sync_inode_metadata(dir, 1);
} else {
unlock_page(page);
}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 416daa6..6346a2a 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -120,7 +120,6 @@
extern struct inode *ext2_iget (struct super_block *, unsigned long);
extern int ext2_write_inode (struct inode *, struct writeback_control *);
extern void ext2_evict_inode(struct inode *);
-extern int ext2_sync_inode (struct inode *);
extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern int ext2_setattr (struct dentry *, struct iattr *);
extern void ext2_set_inode_flags(struct inode *inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 533699c..40ad210 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1203,7 +1203,7 @@
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
if (inode_needs_sync(inode)) {
sync_mapping_buffers(inode->i_mapping);
- ext2_sync_inode (inode);
+ sync_inode_metadata(inode, 1);
} else {
mark_inode_dirty(inode);
}
@@ -1523,15 +1523,6 @@
return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
-int ext2_sync_inode(struct inode *inode)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- return sync_inode(inode, &wbc);
-}
-
int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = dentry->d_inode;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 71efb0e..f8aecd2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -206,7 +206,7 @@
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
err = ext2_add_link(dentry, inode);
if (!err) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 85df87d..0901320 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1221,9 +1221,7 @@
}
es = sbi->s_es;
- if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
- (old_mount_opt & EXT2_MOUNT_XIP)) &&
- invalidate_inodes(sb)) {
+ if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
"xip flag with busy inodes while remounting");
sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8c29ae1..f84700b 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -699,7 +699,7 @@
EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
inode->i_ctime = CURRENT_TIME_SEC;
if (IS_SYNC(inode)) {
- error = ext2_sync_inode (inode);
+ error = sync_inode_metadata(inode, 1);
/* In case sync failed due to ENOSPC the inode was actually
* written (only some dirty data were not) so we just proceed
* as if nothing happened and cleanup the unused block */
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5e0faf4..ad05353 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1696,8 +1696,8 @@
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext3_get_block);
+ ret = __block_write_begin(page, 0, PAGE_CACHE_SIZE,
+ ext3_get_block);
if (ret != 0) {
ext3_journal_stop(handle);
goto out_unlock;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2b35ddb..bce9dce 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2260,7 +2260,7 @@
inode->i_ctime = CURRENT_TIME_SEC;
inc_nlink(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
err = ext3_add_entry(handle, dentry, inode);
if (!err) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4b8debe..49635ef 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1538,10 +1538,10 @@
if (!buffer_mapped(bh) || buffer_freed(bh))
return 0;
/*
- * __block_prepare_write() could have dirtied some buffers. Clean
+ * __block_write_begin() could have dirtied some buffers. Clean
* the dirty bit as jbd2_journal_get_write_access() could complain
* otherwise about fs integrity issues. Setting of the dirty bit
- * by __block_prepare_write() isn't a real problem here as we clear
+ * by __block_write_begin() isn't a real problem here as we clear
* the bit before releasing a page lock and thus writeback cannot
* ever write the buffer.
*/
@@ -2550,8 +2550,7 @@
if (buffer_delay(bh))
return 0; /* Not sure this could or should happen */
/*
- * XXX: __block_prepare_write() unmaps passed block,
- * is it OK?
+ * XXX: __block_write_begin() unmaps passed block, is it OK?
*/
ret = ext4_da_reserve_space(inode, iblock);
if (ret)
@@ -2583,7 +2582,7 @@
/*
* This function is used as a standard get_block_t calback function
* when there is no desire to allocate any blocks. It is used as a
- * callback function for block_prepare_write() and block_write_full_page().
+ * callback function for block_write_begin() and block_write_full_page().
* These functions should only try to map a single block at a time.
*
* Since this function doesn't do block allocations even if the caller
@@ -2743,7 +2742,7 @@
* all are mapped and non delay. We don't want to
* do block allocation here.
*/
- ret = block_prepare_write(page, 0, len,
+ ret = __block_write_begin(page, 0, len,
noalloc_get_block_write);
if (!ret) {
page_bufs = page_buffers(page);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 19aa0d4..42f77b1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2373,6 +2373,7 @@
printk(KERN_ERR "EXT4-fs: can't get new inode\n");
goto err_freesgi;
}
+ sbi->s_buddy_cache->i_ino = get_next_ino();
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
desc = ext4_get_group_desc(sb, i, NULL);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 314c0d3..bd39885 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2312,7 +2312,7 @@
inode->i_ctime = ext4_current_time(inode);
ext4_inc_count(handle, inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
err = ext4_add_entry(handle, dentry, inode);
if (!err) {
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 79d1b4e..8c04eac 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -260,6 +260,7 @@
struct inode *ip = NULL;
if ((ip = new_inode(sbp))) {
+ ip->i_ino = get_next_ino();
vxfs_iinit(ip, vip);
ip->i_mapping->a_ops = &vxfs_aops;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9e46aec..aed881a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -79,6 +79,11 @@
return sb->s_bdi;
}
+static inline struct inode *wb_inode(struct list_head *head)
+{
+ return list_entry(head, struct inode, i_wb_list);
+}
+
static void bdi_queue_work(struct backing_dev_info *bdi,
struct wb_writeback_work *work)
{
@@ -172,11 +177,11 @@
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
- tail = list_entry(wb->b_dirty.next, struct inode, i_list);
+ tail = wb_inode(wb->b_dirty.next);
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- list_move(&inode->i_list, &wb->b_dirty);
+ list_move(&inode->i_wb_list, &wb->b_dirty);
}
/*
@@ -186,7 +191,7 @@
{
struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
- list_move(&inode->i_list, &wb->b_more_io);
+ list_move(&inode->i_wb_list, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -227,14 +232,14 @@
int do_sb_sort = 0;
while (!list_empty(delaying_queue)) {
- inode = list_entry(delaying_queue->prev, struct inode, i_list);
+ inode = wb_inode(delaying_queue->prev);
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
if (sb && sb != inode->i_sb)
do_sb_sort = 1;
sb = inode->i_sb;
- list_move(&inode->i_list, &tmp);
+ list_move(&inode->i_wb_list, &tmp);
}
/* just one sb in list, splice to dispatch_queue and we're done */
@@ -245,12 +250,11 @@
/* Move inodes from one superblock together */
while (!list_empty(&tmp)) {
- inode = list_entry(tmp.prev, struct inode, i_list);
- sb = inode->i_sb;
+ sb = wb_inode(tmp.prev)->i_sb;
list_for_each_prev_safe(pos, node, &tmp) {
- inode = list_entry(pos, struct inode, i_list);
+ inode = wb_inode(pos);
if (inode->i_sb == sb)
- list_move(&inode->i_list, dispatch_queue);
+ list_move(&inode->i_wb_list, dispatch_queue);
}
}
}
@@ -408,16 +412,13 @@
* completion.
*/
redirty_tail(inode);
- } else if (atomic_read(&inode->i_count)) {
- /*
- * The inode is clean, inuse
- */
- list_move(&inode->i_list, &inode_in_use);
} else {
/*
- * The inode is clean, unused
+ * The inode is clean. At this point we either have
+ * a reference to the inode or it's on it's way out.
+ * No need to add it back to the LRU.
*/
- list_move(&inode->i_list, &inode_unused);
+ list_del_init(&inode->i_wb_list);
}
}
inode_sync_complete(inode);
@@ -465,8 +466,7 @@
{
while (!list_empty(&wb->b_io)) {
long pages_skipped;
- struct inode *inode = list_entry(wb->b_io.prev,
- struct inode, i_list);
+ struct inode *inode = wb_inode(wb->b_io.prev);
if (inode->i_sb != sb) {
if (only_this_sb) {
@@ -487,10 +487,16 @@
return 0;
}
- if (inode->i_state & (I_NEW | I_WILL_FREE)) {
+ /*
+ * Don't bother with new inodes or inodes beeing freed, first
+ * kind does not need peridic writeout yet, and for the latter
+ * kind writeout is handled by the freer.
+ */
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
requeue_io(inode);
continue;
}
+
/*
* Was this inode dirtied after sync_sb_inodes was called?
* This keeps sync from extra jobs and livelock.
@@ -498,7 +504,6 @@
if (inode_dirtied_after(inode, wbc->wb_start))
return 1;
- BUG_ON(inode->i_state & I_FREEING);
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
@@ -536,8 +541,7 @@
queue_io(wb, wbc->older_than_this);
while (!list_empty(&wb->b_io)) {
- struct inode *inode = list_entry(wb->b_io.prev,
- struct inode, i_list);
+ struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
if (!pin_sb_for_writeback(sb)) {
@@ -675,8 +679,7 @@
*/
spin_lock(&inode_lock);
if (!list_empty(&wb->b_more_io)) {
- inode = list_entry(wb->b_more_io.prev,
- struct inode, i_list);
+ inode = wb_inode(wb->b_more_io.prev);
trace_wbc_writeback_wait(&wbc, wb->bdi);
inode_wait_for_writeback(inode);
}
@@ -727,7 +730,7 @@
*/
nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+ get_nr_dirty_inodes();
if (nr_pages) {
struct wb_writeback_work work = {
@@ -966,7 +969,7 @@
* dirty list. Add blockdev inodes as well.
*/
if (!S_ISBLK(inode->i_mode)) {
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
goto out;
}
if (inode->i_state & I_FREEING)
@@ -994,7 +997,7 @@
}
inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &bdi->wb.b_dirty);
+ list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
}
}
out:
@@ -1094,8 +1097,7 @@
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- work.nr_pages = nr_dirty + nr_unstable +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+ work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes();
bdi_queue_work(sb->s_bdi, &work);
wait_for_completion(&done);
@@ -1202,3 +1204,23 @@
return ret;
}
EXPORT_SYMBOL(sync_inode);
+
+/**
+ * sync_inode - write an inode to disk
+ * @inode: the inode to sync
+ * @wait: wait for I/O to complete.
+ *
+ * Write an inode to disk and adjust it's dirty state after completion.
+ *
+ * Note: only writes the actual inode, no associated data or other metadata.
+ */
+int sync_inode_metadata(struct inode *inode, int wait)
+{
+ struct writeback_control wbc = {
+ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+ .nr_to_write = 0, /* metadata-only */
+ };
+
+ return sync_inode(inode, &wbc);
+}
+EXPORT_SYMBOL(sync_inode_metadata);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 7367e17..4eba076 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -222,6 +222,7 @@
if (!inode)
return NULL;
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = fc->user_id;
inode->i_gid = fc->group_id;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 6b24afb..4f36f88 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -618,7 +618,6 @@
struct gfs2_alloc *al = NULL;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
- unsigned to = from + len;
struct page *page;
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -691,7 +690,7 @@
}
prepare_write:
- error = block_prepare_write(page, from, to, gfs2_block_map);
+ error = __block_write_begin(page, from, len, gfs2_block_map);
out:
if (error == 0)
return 0;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index aeafc23..cade1ac 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1219,7 +1219,6 @@
fail_locking:
init_locking(sdp, &mount_gh, UNDO);
fail_lm:
- invalidate_inodes(sb);
gfs2_gl_hash_clear(sdp);
gfs2_lm_unmount(sdp);
fail_sys:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 0534510..12cbea7 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -255,7 +255,7 @@
gfs2_holder_uninit(ghs);
gfs2_holder_uninit(ghs + 1);
if (!error) {
- atomic_inc(&inode->i_count);
+ ihold(inode);
d_instantiate(dentry, inode);
mark_inode_dirty(inode);
}
@@ -1294,7 +1294,7 @@
int error;
if (!page_has_buffers(page)) {
- error = block_prepare_write(page, from, to, gfs2_block_map);
+ error = __block_write_begin(page, from, to - from, gfs2_block_map);
if (unlikely(error))
return error;
@@ -1313,7 +1313,7 @@
next += bh->b_size;
if (buffer_mapped(bh)) {
if (end) {
- error = block_prepare_write(page, start, end,
+ error = __block_write_begin(page, start, end - start,
gfs2_block_map);
if (unlikely(error))
return error;
@@ -1328,7 +1328,7 @@
} while (next < to);
if (end) {
- error = block_prepare_write(page, start, end, gfs2_block_map);
+ error = __block_write_begin(page, start, end - start, gfs2_block_map);
if (unlikely(error))
return error;
empty_write_end(page, start, end);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 047d117..2b2c499 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -857,7 +857,6 @@
gfs2_clear_rgrpd(sdp);
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
- invalidate_inodes(sdp->sd_vfs);
gfs2_gl_hash_clear(sdp);
/* Unmount the locking protocol */
gfs2_lm_unmount(sdp);
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 4f55651..c8cffb8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -147,8 +147,6 @@
u16 blockoffset;
int fs_div;
-
- struct hlist_head rsrc_inodes;
};
#define HFS_FLG_BITMAP_DIRTY 0
@@ -254,17 +252,6 @@
sb->s_dirt = 1;
}
-static inline void hfs_buffer_sync(struct buffer_head *bh)
-{
- while (buffer_locked(bh)) {
- wait_on_buffer(bh);
- }
- if (buffer_dirty(bh)) {
- ll_rw_block(WRITE, 1, &bh);
- wait_on_buffer(bh);
- }
-}
-
#define sb_bread512(sb, sec, data) ({ \
struct buffer_head *__bh; \
sector_t __block; \
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 397b7ad..dffb4e9 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -524,7 +524,7 @@
HFS_I(inode)->rsrc_inode = dir;
HFS_I(dir)->rsrc_inode = inode;
igrab(dir);
- hlist_add_head(&inode->i_hash, &HFS_SB(dir->i_sb)->rsrc_inodes);
+ hlist_add_fake(&inode->i_hash);
mark_inode_dirty(inode);
out:
d_add(dentry, inode);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 86428f5..1563d5c 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -220,7 +220,7 @@
mdb->drLsMod = hfs_mtime();
mark_buffer_dirty(HFS_SB(sb)->mdb_bh);
- hfs_buffer_sync(HFS_SB(sb)->mdb_bh);
+ sync_dirty_buffer(HFS_SB(sb)->mdb_bh);
}
return 0;
@@ -287,7 +287,7 @@
HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT);
HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT);
mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh);
- hfs_buffer_sync(HFS_SB(sb)->alt_mdb_bh);
+ sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh);
}
if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) {
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 3325416..6ee1586 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -382,7 +382,6 @@
return -ENOMEM;
sb->s_fs_info = sbi;
- INIT_HLIST_HEAD(&sbi->rsrc_inodes);
res = -EINVAL;
if (!parse_options((char *)data, sbi)) {
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d236d85..e318bbc 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -286,7 +286,7 @@
inc_nlink(inode);
hfsplus_instantiate(dst_dentry, inode, cnid);
- atomic_inc(&inode->i_count);
+ ihold(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
sbi->file_count++;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 7844928..8afd7e8 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -211,7 +211,7 @@
* appear hashed, but do not put on any lists. hlist_del()
* will work fine and require no locking.
*/
- inode->i_hash.pprev = &inode->i_hash.next;
+ hlist_add_fake(&inode->i_hash);
mark_inode_dirty(inode);
out:
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a14328d..b14be3f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -456,6 +456,7 @@
inode = new_inode(sb);
if (inode) {
struct hugetlbfs_inode_info *info;
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = uid;
inode->i_gid = gid;
diff --git a/fs/inode.c b/fs/inode.c
index 56d909d..ae2727a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -29,7 +29,6 @@
/*
* This is needed for the following functions:
* - inode_has_buffers
- * - invalidate_inode_buffers
* - invalidate_bdev
*
* FIXME: remove all knowledge of the buffer layer from this file
@@ -73,8 +72,7 @@
* allowing for low-overhead inode sync() operations.
*/
-LIST_HEAD(inode_in_use);
-LIST_HEAD(inode_unused);
+static LIST_HEAD(inode_lru);
static struct hlist_head *inode_hashtable __read_mostly;
/*
@@ -104,8 +102,41 @@
*/
struct inodes_stat_t inodes_stat;
+static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
+static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+
static struct kmem_cache *inode_cachep __read_mostly;
+static inline int get_nr_inodes(void)
+{
+ return percpu_counter_sum_positive(&nr_inodes);
+}
+
+static inline int get_nr_inodes_unused(void)
+{
+ return percpu_counter_sum_positive(&nr_inodes_unused);
+}
+
+int get_nr_dirty_inodes(void)
+{
+ int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+ return nr_dirty > 0 ? nr_dirty : 0;
+
+}
+
+/*
+ * Handle nr_inode sysctl
+ */
+#ifdef CONFIG_SYSCTL
+int proc_nr_inodes(ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ inodes_stat.nr_inodes = get_nr_inodes();
+ inodes_stat.nr_unused = get_nr_inodes_unused();
+ return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+#endif
+
static void wake_up_inode(struct inode *inode)
{
/*
@@ -193,6 +224,8 @@
inode->i_fsnotify_mask = 0;
#endif
+ percpu_counter_inc(&nr_inodes);
+
return 0;
out:
return -ENOMEM;
@@ -233,11 +266,13 @@
if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
posix_acl_release(inode->i_default_acl);
#endif
+ percpu_counter_dec(&nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);
-void destroy_inode(struct inode *inode)
+static void destroy_inode(struct inode *inode)
{
+ BUG_ON(!list_empty(&inode->i_lru));
__destroy_inode(inode);
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
@@ -256,6 +291,8 @@
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_devices);
+ INIT_LIST_HEAD(&inode->i_wb_list);
+ INIT_LIST_HEAD(&inode->i_lru);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
spin_lock_init(&inode->i_data.tree_lock);
spin_lock_init(&inode->i_data.i_mmap_lock);
@@ -282,14 +319,109 @@
*/
void __iget(struct inode *inode)
{
- if (atomic_inc_return(&inode->i_count) != 1)
- return;
-
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
- list_move(&inode->i_list, &inode_in_use);
- inodes_stat.nr_unused--;
+ atomic_inc(&inode->i_count);
}
+/*
+ * get additional reference to inode; caller must already hold one.
+ */
+void ihold(struct inode *inode)
+{
+ WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+}
+EXPORT_SYMBOL(ihold);
+
+static void inode_lru_list_add(struct inode *inode)
+{
+ if (list_empty(&inode->i_lru)) {
+ list_add(&inode->i_lru, &inode_lru);
+ percpu_counter_inc(&nr_inodes_unused);
+ }
+}
+
+static void inode_lru_list_del(struct inode *inode)
+{
+ if (!list_empty(&inode->i_lru)) {
+ list_del_init(&inode->i_lru);
+ percpu_counter_dec(&nr_inodes_unused);
+ }
+}
+
+static inline void __inode_sb_list_add(struct inode *inode)
+{
+ list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
+}
+
+/**
+ * inode_sb_list_add - add inode to the superblock list of inodes
+ * @inode: inode to add
+ */
+void inode_sb_list_add(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ __inode_sb_list_add(inode);
+ spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_sb_list_add);
+
+static inline void __inode_sb_list_del(struct inode *inode)
+{
+ list_del_init(&inode->i_sb_list);
+}
+
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp;
+
+ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+ L1_CACHE_BYTES;
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock.
+ */
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+ struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
+
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, b);
+ spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(__insert_inode_hash);
+
+/**
+ * __remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock.
+ */
+static void __remove_inode_hash(struct inode *inode)
+{
+ hlist_del_init(&inode->i_hash);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock.
+ */
+void remove_inode_hash(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL(remove_inode_hash);
+
void end_writeback(struct inode *inode)
{
might_sleep();
@@ -328,101 +460,113 @@
*/
static void dispose_list(struct list_head *head)
{
- int nr_disposed = 0;
-
while (!list_empty(head)) {
struct inode *inode;
- inode = list_first_entry(head, struct inode, i_list);
- list_del(&inode->i_list);
+ inode = list_first_entry(head, struct inode, i_lru);
+ list_del_init(&inode->i_lru);
evict(inode);
spin_lock(&inode_lock);
- hlist_del_init(&inode->i_hash);
- list_del_init(&inode->i_sb_list);
+ __remove_inode_hash(inode);
+ __inode_sb_list_del(inode);
spin_unlock(&inode_lock);
wake_up_inode(inode);
destroy_inode(inode);
- nr_disposed++;
}
- spin_lock(&inode_lock);
- inodes_stat.nr_inodes -= nr_disposed;
- spin_unlock(&inode_lock);
-}
-
-/*
- * Invalidate all inodes for a device.
- */
-static int invalidate_list(struct list_head *head, struct list_head *dispose)
-{
- struct list_head *next;
- int busy = 0, count = 0;
-
- next = head->next;
- for (;;) {
- struct list_head *tmp = next;
- struct inode *inode;
-
- /*
- * We can reschedule here without worrying about the list's
- * consistency because the per-sb list of inodes must not
- * change during umount anymore, and because iprune_sem keeps
- * shrink_icache_memory() away.
- */
- cond_resched_lock(&inode_lock);
-
- next = next->next;
- if (tmp == head)
- break;
- inode = list_entry(tmp, struct inode, i_sb_list);
- if (inode->i_state & I_NEW)
- continue;
- invalidate_inode_buffers(inode);
- if (!atomic_read(&inode->i_count)) {
- list_move(&inode->i_list, dispose);
- WARN_ON(inode->i_state & I_NEW);
- inode->i_state |= I_FREEING;
- count++;
- continue;
- }
- busy = 1;
- }
- /* only unused inodes may be cached with i_count zero */
- inodes_stat.nr_unused -= count;
- return busy;
}
/**
- * invalidate_inodes - discard the inodes on a device
- * @sb: superblock
+ * evict_inodes - evict all evictable inodes for a superblock
+ * @sb: superblock to operate on
*
- * Discard all of the inodes for a given superblock. If the discard
- * fails because there are busy inodes then a non zero value is returned.
- * If the discard is successful all the inodes have been discarded.
+ * Make sure that no inodes with zero refcount are retained. This is
+ * called by superblock shutdown after having MS_ACTIVE flag removed,
+ * so any inode reaching zero refcount during or after that call will
+ * be immediately evicted.
+ */
+void evict_inodes(struct super_block *sb)
+{
+ struct inode *inode, *next;
+ LIST_HEAD(dispose);
+
+ down_write(&iprune_sem);
+
+ spin_lock(&inode_lock);
+ list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ if (atomic_read(&inode->i_count))
+ continue;
+
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
+ WARN_ON(1);
+ continue;
+ }
+
+ inode->i_state |= I_FREEING;
+
+ /*
+ * Move the inode off the IO lists and LRU once I_FREEING is
+ * set so that it won't get moved back on there if it is dirty.
+ */
+ list_move(&inode->i_lru, &dispose);
+ list_del_init(&inode->i_wb_list);
+ if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+ percpu_counter_dec(&nr_inodes_unused);
+ }
+ spin_unlock(&inode_lock);
+
+ dispose_list(&dispose);
+ up_write(&iprune_sem);
+}
+
+/**
+ * invalidate_inodes - attempt to free all inodes on a superblock
+ * @sb: superblock to operate on
+ *
+ * Attempts to free all inodes for a given superblock. If there were any
+ * busy inodes return a non-zero value, else zero.
*/
int invalidate_inodes(struct super_block *sb)
{
- int busy;
- LIST_HEAD(throw_away);
+ int busy = 0;
+ struct inode *inode, *next;
+ LIST_HEAD(dispose);
down_write(&iprune_sem);
+
spin_lock(&inode_lock);
- fsnotify_unmount_inodes(&sb->s_inodes);
- busy = invalidate_list(&sb->s_inodes, &throw_away);
+ list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
+ continue;
+ if (atomic_read(&inode->i_count)) {
+ busy = 1;
+ continue;
+ }
+
+ inode->i_state |= I_FREEING;
+
+ /*
+ * Move the inode off the IO lists and LRU once I_FREEING is
+ * set so that it won't get moved back on there if it is dirty.
+ */
+ list_move(&inode->i_lru, &dispose);
+ list_del_init(&inode->i_wb_list);
+ if (!(inode->i_state & (I_DIRTY | I_SYNC)))
+ percpu_counter_dec(&nr_inodes_unused);
+ }
spin_unlock(&inode_lock);
- dispose_list(&throw_away);
+ dispose_list(&dispose);
up_write(&iprune_sem);
return busy;
}
-EXPORT_SYMBOL(invalidate_inodes);
static int can_unuse(struct inode *inode)
{
- if (inode->i_state)
+ if (inode->i_state & ~I_REFERENCED)
return 0;
if (inode_has_buffers(inode))
return 0;
@@ -434,22 +578,24 @@
}
/*
- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
- * a temporary list and then are freed outside inode_lock by dispose_list().
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
+ * temporary list and then are freed outside inode_lock by dispose_list().
*
* Any inodes which are pinned purely because of attached pagecache have their
- * pagecache removed. We expect the final iput() on that inode to add it to
- * the front of the inode_unused list. So look for it there and if the
- * inode is still freeable, proceed. The right inode is found 99.9% of the
- * time in testing on a 4-way.
+ * pagecache removed. If the inode has metadata buffers attached to
+ * mapping->private_list then try to remove them.
*
- * If the inode has metadata buffers attached to mapping->private_list then
- * try to remove them.
+ * If the inode has the I_REFERENCED flag set, then it means that it has been
+ * used recently - the flag is set in iput_final(). When we encounter such an
+ * inode, clear the flag and move it to the back of the LRU so it gets another
+ * pass through the LRU before it gets reclaimed. This is necessary because of
+ * the fact we are doing lazy LRU updates to minimise lock contention so the
+ * LRU does not have strict ordering. Hence we don't want to reclaim inodes
+ * with this flag set because they are the inodes that are out of order.
*/
static void prune_icache(int nr_to_scan)
{
LIST_HEAD(freeable);
- int nr_pruned = 0;
int nr_scanned;
unsigned long reap = 0;
@@ -458,13 +604,26 @@
for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
struct inode *inode;
- if (list_empty(&inode_unused))
+ if (list_empty(&inode_lru))
break;
- inode = list_entry(inode_unused.prev, struct inode, i_list);
+ inode = list_entry(inode_lru.prev, struct inode, i_lru);
- if (inode->i_state || atomic_read(&inode->i_count)) {
- list_move(&inode->i_list, &inode_unused);
+ /*
+ * Referenced or dirty inodes are still in use. Give them
+ * another pass through the LRU as we canot reclaim them now.
+ */
+ if (atomic_read(&inode->i_count) ||
+ (inode->i_state & ~I_REFERENCED)) {
+ list_del_init(&inode->i_lru);
+ percpu_counter_dec(&nr_inodes_unused);
+ continue;
+ }
+
+ /* recently referenced inodes get one more pass */
+ if (inode->i_state & I_REFERENCED) {
+ list_move(&inode->i_lru, &inode_lru);
+ inode->i_state &= ~I_REFERENCED;
continue;
}
if (inode_has_buffers(inode) || inode->i_data.nrpages) {
@@ -476,18 +635,23 @@
iput(inode);
spin_lock(&inode_lock);
- if (inode != list_entry(inode_unused.next,
- struct inode, i_list))
+ if (inode != list_entry(inode_lru.next,
+ struct inode, i_lru))
continue; /* wrong inode or list_empty */
if (!can_unuse(inode))
continue;
}
- list_move(&inode->i_list, &freeable);
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
- nr_pruned++;
+
+ /*
+ * Move the inode off the IO lists and LRU once I_FREEING is
+ * set so that it won't get moved back on there if it is dirty.
+ */
+ list_move(&inode->i_lru, &freeable);
+ list_del_init(&inode->i_wb_list);
+ percpu_counter_dec(&nr_inodes_unused);
}
- inodes_stat.nr_unused -= nr_pruned;
if (current_is_kswapd())
__count_vm_events(KSWAPD_INODESTEAL, reap);
else
@@ -519,7 +683,7 @@
return -1;
prune_icache(nr);
}
- return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+ return (get_nr_inodes_unused() / 100) * sysctl_vfs_cache_pressure;
}
static struct shrinker icache_shrinker = {
@@ -530,9 +694,6 @@
static void __wait_on_freeing_inode(struct inode *inode);
/*
* Called with the inode lock held.
- * NOTE: we are not increasing the inode-refcount, you must call __iget()
- * by hand after calling find_inode now! This simplifies iunique and won't
- * add any additional branch in the common code.
*/
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
@@ -552,9 +713,10 @@
__wait_on_freeing_inode(inode);
goto repeat;
}
- break;
+ __iget(inode);
+ return inode;
}
- return node ? inode : NULL;
+ return NULL;
}
/*
@@ -577,53 +739,49 @@
__wait_on_freeing_inode(inode);
goto repeat;
}
- break;
+ __iget(inode);
+ return inode;
}
- return node ? inode : NULL;
+ return NULL;
}
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
- unsigned long tmp;
-
- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
- L1_CACHE_BYTES;
- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
- return tmp & I_HASHMASK;
-}
-
-static inline void
-__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
- struct inode *inode)
-{
- inodes_stat.nr_inodes++;
- list_add(&inode->i_list, &inode_in_use);
- list_add(&inode->i_sb_list, &sb->s_inodes);
- if (head)
- hlist_add_head(&inode->i_hash, head);
-}
-
-/**
- * inode_add_to_lists - add a new inode to relevant lists
- * @sb: superblock inode belongs to
- * @inode: inode to mark in use
+/*
+ * Each cpu owns a range of LAST_INO_BATCH numbers.
+ * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
+ * to renew the exhausted range.
*
- * When an inode is allocated it needs to be accounted for, added to the in use
- * list, the owning superblock and the inode hash. This needs to be done under
- * the inode_lock, so export a function to do this rather than the inode lock
- * itself. We calculate the hash list to add to here so it is all internal
- * which requires the caller to have already set up the inode number in the
- * inode to add.
+ * This does not significantly increase overflow rate because every CPU can
+ * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
+ * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
+ * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
+ * overflow rate by 2x, which does not seem too significant.
+ *
+ * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
+ * error if st_ino won't fit in target struct field. Use 32bit counter
+ * here to attempt to avoid that.
*/
-void inode_add_to_lists(struct super_block *sb, struct inode *inode)
-{
- struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+#define LAST_INO_BATCH 1024
+static DEFINE_PER_CPU(unsigned int, last_ino);
- spin_lock(&inode_lock);
- __inode_add_to_lists(sb, head, inode);
- spin_unlock(&inode_lock);
+unsigned int get_next_ino(void)
+{
+ unsigned int *p = &get_cpu_var(last_ino);
+ unsigned int res = *p;
+
+#ifdef CONFIG_SMP
+ if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) {
+ static atomic_t shared_last_ino;
+ int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino);
+
+ res = next - LAST_INO_BATCH;
+ }
+#endif
+
+ *p = ++res;
+ put_cpu_var(last_ino);
+ return res;
}
-EXPORT_SYMBOL_GPL(inode_add_to_lists);
+EXPORT_SYMBOL(get_next_ino);
/**
* new_inode - obtain an inode
@@ -639,12 +797,6 @@
*/
struct inode *new_inode(struct super_block *sb)
{
- /*
- * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
- * error if st_ino won't fit in target struct field. Use 32bit counter
- * here to attempt to avoid that.
- */
- static unsigned int last_ino;
struct inode *inode;
spin_lock_prefetch(&inode_lock);
@@ -652,8 +804,7 @@
inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode_lock);
- __inode_add_to_lists(sb, NULL, inode);
- inode->i_ino = ++last_ino;
+ __inode_sb_list_add(inode);
inode->i_state = 0;
spin_unlock(&inode_lock);
}
@@ -664,7 +815,7 @@
void unlock_new_inode(struct inode *inode)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
- if (inode->i_mode & S_IFDIR) {
+ if (S_ISDIR(inode->i_mode)) {
struct file_system_type *type = inode->i_sb->s_type;
/* Set new key only if filesystem hasn't already changed it */
@@ -721,7 +872,8 @@
if (set(inode, data))
goto set_failed;
- __inode_add_to_lists(sb, head, inode);
+ hlist_add_head(&inode->i_hash, head);
+ __inode_sb_list_add(inode);
inode->i_state = I_NEW;
spin_unlock(&inode_lock);
@@ -736,7 +888,6 @@
* us. Use the old inode instead of the one we just
* allocated.
*/
- __iget(old);
spin_unlock(&inode_lock);
destroy_inode(inode);
inode = old;
@@ -768,7 +919,8 @@
old = find_inode_fast(sb, head, ino);
if (!old) {
inode->i_ino = ino;
- __inode_add_to_lists(sb, head, inode);
+ hlist_add_head(&inode->i_hash, head);
+ __inode_sb_list_add(inode);
inode->i_state = I_NEW;
spin_unlock(&inode_lock);
@@ -783,7 +935,6 @@
* us. Use the old inode instead of the one we just
* allocated.
*/
- __iget(old);
spin_unlock(&inode_lock);
destroy_inode(inode);
inode = old;
@@ -792,6 +943,27 @@
return inode;
}
+/*
+ * search the inode cache for a matching inode number.
+ * If we find one, then the inode number we are trying to
+ * allocate is not unique and so we should not use it.
+ *
+ * Returns 1 if the inode number is unique, 0 if it is not.
+ */
+static int test_inode_iunique(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *b = inode_hashtable + hash(sb, ino);
+ struct hlist_node *node;
+ struct inode *inode;
+
+ hlist_for_each_entry(inode, node, b, i_hash) {
+ if (inode->i_ino == ino && inode->i_sb == sb)
+ return 0;
+ }
+
+ return 1;
+}
+
/**
* iunique - get a unique inode number
* @sb: superblock
@@ -813,19 +985,18 @@
* error if st_ino won't fit in target struct field. Use 32bit counter
* here to attempt to avoid that.
*/
+ static DEFINE_SPINLOCK(iunique_lock);
static unsigned int counter;
- struct inode *inode;
- struct hlist_head *head;
ino_t res;
spin_lock(&inode_lock);
+ spin_lock(&iunique_lock);
do {
if (counter <= max_reserved)
counter = max_reserved + 1;
res = counter++;
- head = inode_hashtable + hash(sb, res);
- inode = find_inode_fast(sb, head, res);
- } while (inode != NULL);
+ } while (!test_inode_iunique(sb, res));
+ spin_unlock(&iunique_lock);
spin_unlock(&inode_lock);
return res;
@@ -877,7 +1048,6 @@
spin_lock(&inode_lock);
inode = find_inode(sb, head, test, data);
if (inode) {
- __iget(inode);
spin_unlock(&inode_lock);
if (likely(wait))
wait_on_inode(inode);
@@ -910,7 +1080,6 @@
spin_lock(&inode_lock);
inode = find_inode_fast(sb, head, ino);
if (inode) {
- __iget(inode);
spin_unlock(&inode_lock);
wait_on_inode(inode);
return inode;
@@ -1096,7 +1265,7 @@
__iget(old);
spin_unlock(&inode_lock);
wait_on_inode(old);
- if (unlikely(!hlist_unhashed(&old->i_hash))) {
+ if (unlikely(!inode_unhashed(old))) {
iput(old);
return -EBUSY;
}
@@ -1135,7 +1304,7 @@
__iget(old);
spin_unlock(&inode_lock);
wait_on_inode(old);
- if (unlikely(!hlist_unhashed(&old->i_hash))) {
+ if (unlikely(!inode_unhashed(old))) {
iput(old);
return -EBUSY;
}
@@ -1144,36 +1313,6 @@
}
EXPORT_SYMBOL(insert_inode_locked4);
-/**
- * __insert_inode_hash - hash an inode
- * @inode: unhashed inode
- * @hashval: unsigned long value used to locate this object in the
- * inode_hashtable.
- *
- * Add an inode to the inode hash for this superblock.
- */
-void __insert_inode_hash(struct inode *inode, unsigned long hashval)
-{
- struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
- spin_lock(&inode_lock);
- hlist_add_head(&inode->i_hash, head);
- spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(__insert_inode_hash);
-
-/**
- * remove_inode_hash - remove an inode from the hash
- * @inode: inode to unhash
- *
- * Remove an inode from the superblock.
- */
-void remove_inode_hash(struct inode *inode)
-{
- spin_lock(&inode_lock);
- hlist_del_init(&inode->i_hash);
- spin_unlock(&inode_lock);
-}
-EXPORT_SYMBOL(remove_inode_hash);
int generic_delete_inode(struct inode *inode)
{
@@ -1188,7 +1327,7 @@
*/
int generic_drop_inode(struct inode *inode)
{
- return !inode->i_nlink || hlist_unhashed(&inode->i_hash);
+ return !inode->i_nlink || inode_unhashed(inode);
}
EXPORT_SYMBOL_GPL(generic_drop_inode);
@@ -1214,10 +1353,11 @@
drop = generic_drop_inode(inode);
if (!drop) {
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
- list_move(&inode->i_list, &inode_unused);
- inodes_stat.nr_unused++;
if (sb->s_flags & MS_ACTIVE) {
+ inode->i_state |= I_REFERENCED;
+ if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
+ inode_lru_list_add(inode);
+ }
spin_unlock(&inode_lock);
return;
}
@@ -1228,19 +1368,23 @@
spin_lock(&inode_lock);
WARN_ON(inode->i_state & I_NEW);
inode->i_state &= ~I_WILL_FREE;
- inodes_stat.nr_unused--;
- hlist_del_init(&inode->i_hash);
+ __remove_inode_hash(inode);
}
- list_del_init(&inode->i_list);
- list_del_init(&inode->i_sb_list);
+
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
- inodes_stat.nr_inodes--;
+
+ /*
+ * Move the inode off the IO lists and LRU once I_FREEING is
+ * set so that it won't get moved back on there if it is dirty.
+ */
+ inode_lru_list_del(inode);
+ list_del_init(&inode->i_wb_list);
+
+ __inode_sb_list_del(inode);
spin_unlock(&inode_lock);
evict(inode);
- spin_lock(&inode_lock);
- hlist_del_init(&inode->i_hash);
- spin_unlock(&inode_lock);
+ remove_inode_hash(inode);
wake_up_inode(inode);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
destroy_inode(inode);
@@ -1504,6 +1648,8 @@
SLAB_MEM_SPREAD),
init_once);
register_shrinker(&icache_shrinker);
+ percpu_counter_init(&nr_inodes, 0);
+ percpu_counter_init(&nr_inodes_unused, 0);
/* Hash may have been set up in inode_init_early */
if (!hashdist)
diff --git a/fs/internal.h b/fs/internal.h
index a6910e9..ebad3b9 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,3 +101,10 @@
struct nameidata;
extern struct file *nameidata_to_filp(struct nameidata *);
extern void release_open_intent(struct nameidata *);
+
+/*
+ * inode.c
+ */
+extern int get_nr_dirty_inodes(void);
+extern int evict_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 09ff41a..60c2b944 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -962,25 +962,23 @@
* or getblk() if they are not. Returns the number of blocks inserted
* (-ve == error.)
*/
-int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
+int isofs_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head **bh, unsigned long nblocks)
{
- unsigned long b_off;
+ unsigned long b_off = iblock;
unsigned offset, sect_size;
unsigned int firstext;
unsigned long nextblk, nextoff;
- long iblock = (long)iblock_s;
int section, rv, error;
struct iso_inode_info *ei = ISOFS_I(inode);
error = -EIO;
rv = 0;
- if (iblock < 0 || iblock != iblock_s) {
+ if (iblock != b_off) {
printk(KERN_DEBUG "%s: block number too large\n", __func__);
goto abort;
}
- b_off = iblock;
offset = 0;
firstext = ei->i_first_extent;
@@ -998,8 +996,9 @@
* I/O errors.
*/
if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
- printk(KERN_DEBUG "%s: block >= EOF (%ld, %ld)\n",
- __func__, iblock, (unsigned long) inode->i_size);
+ printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
+ __func__, b_off,
+ (unsigned long long)inode->i_size);
goto abort;
}
@@ -1025,9 +1024,9 @@
if (++section > 100) {
printk(KERN_DEBUG "%s: More than 100 file sections ?!?"
" aborting...\n", __func__);
- printk(KERN_DEBUG "%s: block=%ld firstext=%u sect_size=%u "
+ printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u "
"nextblk=%lu nextoff=%lu\n", __func__,
- iblock, firstext, (unsigned) sect_size,
+ b_off, firstext, (unsigned) sect_size,
nextblk, nextoff);
goto abort;
}
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index ed78a3cf3..79121aa 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -289,7 +289,7 @@
mutex_unlock(&f->sem);
d_instantiate(dentry, old_dentry->d_inode);
dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
- atomic_inc(&old_dentry->d_inode->i_count);
+ ihold(old_dentry->d_inode);
}
return ret;
}
@@ -864,7 +864,7 @@
printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret);
/* Might as well let the VFS know */
d_instantiate(new_dentry, old_dentry->d_inode);
- atomic_inc(&old_dentry->d_inode->i_count);
+ ihold(old_dentry->d_inode);
new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
return ret;
}
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index f8332dc..3a09423 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -497,7 +497,7 @@
* appear hashed, but do not put on any lists. hlist_del()
* will work fine and require no locking.
*/
- ip->i_hash.pprev = &ip->i_hash.next;
+ hlist_add_fake(&ip->i_hash);
return (ip);
}
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index d945ea7..9466957 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1279,7 +1279,7 @@
* lazy commit thread finishes processing
*/
if (tblk->xflag & COMMIT_DELETE) {
- atomic_inc(&tblk->u.ip->i_count);
+ ihold(tblk->u.ip);
/*
* Avoid a rare deadlock
*
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index a9cf8e8..231ca4a 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -839,7 +839,7 @@
ip->i_ctime = CURRENT_TIME;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
mark_inode_dirty(dir);
- atomic_inc(&ip->i_count);
+ ihold(ip);
iplist[0] = ip;
iplist[1] = dir;
diff --git a/fs/libfs.c b/fs/libfs.c
index 62baa03..304a513 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -255,7 +255,7 @@
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
inc_nlink(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
dget(dentry);
d_instantiate(dentry, inode);
return 0;
@@ -892,10 +892,6 @@
*/
int generic_file_fsync(struct file *file, int datasync)
{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* metadata-only; caller takes care of data */
- };
struct inode *inode = file->f_mapping->host;
int err;
int ret;
@@ -906,7 +902,7 @@
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
return ret;
- err = sync_inode(inode, &wbc);
+ err = sync_inode_metadata(inode, 1);
if (ret == 0)
ret = err;
return ret;
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 1eb4e89..409dfd6 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -569,7 +569,7 @@
return -EMLINK;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- atomic_inc(&inode->i_count);
+ ihold(inode);
inode->i_nlink++;
mark_inode_dirty_sync(inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f3f3578..c0d35a3 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -101,7 +101,7 @@
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
return add_nondir(dentry, inode);
}
diff --git a/fs/namei.c b/fs/namei.c
index 24896e8..f7dbc06 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1121,11 +1121,13 @@
static struct dentry *__lookup_hash(struct qstr *name,
struct dentry *base, struct nameidata *nd)
{
+ struct inode *inode = base->d_inode;
struct dentry *dentry;
- struct inode *inode;
int err;
- inode = base->d_inode;
+ err = exec_permission(inode);
+ if (err)
+ return ERR_PTR(err);
/*
* See if the low-level filesystem might want
@@ -1161,11 +1163,6 @@
*/
static struct dentry *lookup_hash(struct nameidata *nd)
{
- int err;
-
- err = exec_permission(nd->path.dentry->d_inode);
- if (err)
- return ERR_PTR(err);
return __lookup_hash(&nd->last, nd->path.dentry, nd);
}
@@ -1213,9 +1210,6 @@
if (err)
return ERR_PTR(err);
- err = exec_permission(base->d_inode);
- if (err)
- return ERR_PTR(err);
return __lookup_hash(&this, base, NULL);
}
@@ -2291,7 +2285,7 @@
goto slashes;
inode = dentry->d_inode;
if (inode)
- atomic_inc(&inode->i_count);
+ ihold(inode);
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit2;
diff --git a/fs/namespace.c b/fs/namespace.c
index 7ca5182..8a415c9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -595,7 +595,7 @@
goto out_free;
}
- mnt->mnt_flags = old->mnt_flags;
+ mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
atomic_inc(&sb->s_active);
mnt->mnt_sb = sb;
mnt->mnt_root = dget(root);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 257e405..07ac384 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1801,7 +1801,7 @@
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
- atomic_inc(&inode->i_count);
+ ihold(inode);
d_add(dentry, inode);
}
return error;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index a70e446..ac7b814 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -54,8 +54,7 @@
iput(inode);
return -ENOMEM;
}
- /* Circumvent igrab(): we know the inode is not being freed */
- atomic_inc(&inode->i_count);
+ ihold(inode);
/*
* Ensure that this dentry is invisible to d_find_alias().
* Otherwise, it may be spliced into the tree by
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 661a6cf..184938f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -281,23 +281,13 @@
{
struct inode *inode = fhp->fh_dentry->d_inode;
const struct export_operations *export_ops = inode->i_sb->s_export_op;
- int error = 0;
if (!EX_ISSYNC(fhp->fh_export))
return 0;
- if (export_ops->commit_metadata) {
- error = export_ops->commit_metadata(inode);
- } else {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* metadata only */
- };
-
- error = sync_inode(inode, &wbc);
- }
-
- return error;
+ if (export_ops->commit_metadata)
+ return export_ops->commit_metadata(inode);
+ return sync_inode_metadata(inode, 1);
}
/*
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 185d160..6e9557e 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -207,7 +207,7 @@
inode->i_ctime = CURRENT_TIME;
inode_inc_link_count(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
err = nilfs_add_nondir(dentry, inode);
if (!err)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 3680242..4498a20 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -88,8 +88,6 @@
{
struct dentry *parent;
struct inode *p_inode;
- bool send = false;
- bool should_update_children = false;
if (!dentry)
dentry = path->dentry;
@@ -97,29 +95,12 @@
if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
return;
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent;
+ parent = dget_parent(dentry);
p_inode = parent->d_inode;
- if (fsnotify_inode_watches_children(p_inode)) {
- if (p_inode->i_fsnotify_mask & mask) {
- dget(parent);
- send = true;
- }
- } else {
- /*
- * The parent doesn't care about events on it's children but
- * at least one child thought it did. We need to run all the
- * children and update their d_flags to let them know p_inode
- * doesn't care about them any more.
- */
- dget(parent);
- should_update_children = true;
- }
-
- spin_unlock(&dentry->d_lock);
-
- if (send) {
+ if (unlikely(!fsnotify_inode_watches_children(p_inode)))
+ __fsnotify_update_child_dentry_flags(p_inode);
+ else if (p_inode->i_fsnotify_mask & mask) {
/* we are notifying a parent so come up with the new mask which
* specifies these are events which came from a child. */
mask |= FS_EVENT_ON_CHILD;
@@ -130,13 +111,9 @@
else
fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
dentry->d_name.name, 0);
- dput(parent);
}
- if (unlikely(should_update_children)) {
- __fsnotify_update_child_dentry_flags(p_inode);
- dput(parent);
- }
+ dput(parent);
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 33297c0..21ed106 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -240,6 +240,7 @@
{
struct inode *inode, *next_i, *need_iput = NULL;
+ spin_lock(&inode_lock);
list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
struct inode *need_iput_tmp;
@@ -297,4 +298,5 @@
spin_lock(&inode_lock);
}
+ spin_unlock(&inode_lock);
}
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 19c5180..d3fbe57 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2911,8 +2911,8 @@
goto unl_upcase_iput_tmp_ino_err_out_now;
}
if ((sb->s_root = d_alloc_root(vol->root_ino))) {
- /* We increment i_count simulating an ntfs_iget(). */
- atomic_inc(&vol->root_ino->i_count);
+ /* We grab a reference, simulating an ntfs_iget(). */
+ ihold(vol->root_ino);
ntfs_debug("Exiting, status successful.");
/* Release the default upcase if it has no users. */
mutex_lock(&ntfs_lock);
@@ -3021,21 +3021,6 @@
if (vol->mft_ino && vol->mft_ino != tmp_ino)
iput(vol->mft_ino);
vol->mft_ino = NULL;
- /*
- * This is needed to get ntfs_clear_extent_inode() called for each
- * inode we have ever called ntfs_iget()/iput() on, otherwise we A)
- * leak resources and B) a subsequent mount fails automatically due to
- * ntfs_iget() never calling down into our ntfs_read_locked_inode()
- * method again... FIXME: Do we need to do this twice now because of
- * attribute inodes? I think not, so leave as is for now... (AIA)
- */
- if (invalidate_inodes(sb)) {
- ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
- "driver bug.");
- /* Copied from fs/super.c. I just love this message. (-; */
- printk("NTFS: Busy inodes after umount. Self-destruct in 5 "
- "seconds. Have a nice day...\n");
- }
/* Errors at this stage are irrelevant. */
err_out_now:
sb->s_fs_info = NULL;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5cfeee1..f1e962c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -165,7 +165,7 @@
* ocfs2 never allocates in this function - the only time we
* need to use BH_New is when we're extending i_size on a file
* system which doesn't support holes, in which case BH_New
- * allows block_prepare_write() to zero.
+ * allows __block_write_begin() to zero.
*
* If we see this on a sparse file system, then a truncate has
* raced us and removed the cluster. In this case, we clear
@@ -407,21 +407,6 @@
return ret;
}
-/*
- * This is called from ocfs2_write_zero_page() which has handled it's
- * own cluster locking and has ensured allocation exists for those
- * blocks to be written.
- */
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
- unsigned from, unsigned to)
-{
- int ret;
-
- ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
- return ret;
-}
-
/* Taken from ext3. We don't necessarily need the full blown
* functionality yet, but IMHO it's better to cut and paste the whole
* thing so we can avoid introducing our own bugs (and easily pick up
@@ -732,7 +717,7 @@
}
/*
- * Some of this taken from block_prepare_write(). We already have our
+ * Some of this taken from __block_write_begin(). We already have our
* mapping by now though, and the entire write will be allocating or
* it won't, so not much need to use BH_New.
*
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 7606f66..76bfdfd 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,9 +22,6 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
-int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
- unsigned from, unsigned to);
-
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
unsigned from,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index a7ebd9d..75e115f 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -400,6 +400,7 @@
if (inode) {
ip = DLMFS_I(inode);
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
@@ -425,6 +426,7 @@
if (!inode)
return NULL;
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1ca6867..77b4c04 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -796,13 +796,12 @@
block_end = block_start + (1 << inode->i_blkbits);
/*
- * block_start is block-aligned. Bump it by one to
- * force ocfs2_{prepare,commit}_write() to zero the
+ * block_start is block-aligned. Bump it by one to force
+ * __block_write_begin and block_commit_write to zero the
* whole block.
*/
- ret = ocfs2_prepare_write_nolock(inode, page,
- block_start + 1,
- block_start + 1);
+ ret = __block_write_begin(page, block_start + 1, 0,
+ ocfs2_get_block);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e7bde21..ff5744e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -742,7 +742,7 @@
goto out_commit;
}
- atomic_inc(&inode->i_count);
+ ihold(inode);
dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37eb1eb..d2d7566 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -954,6 +954,8 @@
if (!inode)
goto fail_inode;
+ inode->i_ino = get_next_ino();
+
pipe = alloc_pipe_info(inode);
if (!pipe)
goto fail_iput;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 53dc8ad..9b094c1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -771,6 +771,8 @@
static int mem_open(struct inode* inode, struct file* file)
{
file->private_data = (void*)((long)current->self_exec_id);
+ /* OK to pass negative loff_t, we can catch out-of-range */
+ file->f_mode |= FMODE_UNSIGNED_OFFSET;
return 0;
}
@@ -1646,6 +1648,7 @@
/* Common stuff */
ei = PROC_I(inode);
+ inode->i_ino = get_next_ino();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_op = &proc_def_inode_operations;
@@ -2592,6 +2595,7 @@
/* Initialize the inode */
ei = PROC_I(inode);
+ inode->i_ino = get_next_ino();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 2fc5255..b652cb0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,6 +23,8 @@
if (!inode)
goto out;
+ inode->i_ino = get_next_ino();
+
sysctl_head_get(head);
ei = PROC_I(inode);
ei->sysctl = head;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a5ebae7..67fadb1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,6 +58,7 @@
struct inode * inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
diff --git a/fs/read_write.c b/fs/read_write.c
index e757ef2..9cd9d14 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,6 +31,20 @@
EXPORT_SYMBOL(generic_ro_fops);
+static int
+__negative_fpos_check(struct file *file, loff_t pos, size_t count)
+{
+ /*
+ * pos or pos+count is negative here, check overflow.
+ * too big "count" will be caught in rw_verify_area().
+ */
+ if ((pos < 0) && (pos + count < pos))
+ return -EOVERFLOW;
+ if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+ return 0;
+ return -EINVAL;
+}
+
/**
* generic_file_llseek_unlocked - lockless generic llseek implementation
* @file: file structure to seek on
@@ -62,7 +76,9 @@
break;
}
- if (offset < 0 || offset > inode->i_sb->s_maxbytes)
+ if (offset < 0 && __negative_fpos_check(file, offset, 0))
+ return -EINVAL;
+ if (offset > inode->i_sb->s_maxbytes)
return -EINVAL;
/* Special lock needed here? */
@@ -137,7 +153,7 @@
offset += file->f_pos;
}
retval = -EINVAL;
- if (offset >= 0) {
+ if (offset >= 0 || !__negative_fpos_check(file, offset, 0)) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
@@ -221,6 +237,7 @@
}
#endif
+
/*
* rw_verify_area doesn't like huge counts. We limit
* them to something that fits in "int" so that others
@@ -238,8 +255,11 @@
if (unlikely((ssize_t) count < 0))
return retval;
pos = *ppos;
- if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
- return retval;
+ if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
+ retval = __negative_fpos_check(file, pos, count);
+ if (retval)
+ return retval;
+ }
if (unlikely(inode->i_flock && mandatory_lock(inode))) {
retval = locks_mandatory_area(
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c1f9389..41656d4 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,8 +22,6 @@
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
- unsigned from, unsigned to);
void reiserfs_evict_inode(struct inode *inode)
{
@@ -165,7 +163,7 @@
** but tail is still sitting in a direct item, and we can't write to
** it. So, look through this page, and check all the mapped buffers
** to make sure they have valid block numbers. Any that don't need
-** to be unmapped, so that block_prepare_write will correctly call
+** to be unmapped, so that __block_write_begin will correctly call
** reiserfs_get_block to convert the tail into an unformatted node
*/
static inline void fix_tail_page_for_writing(struct page *page)
@@ -439,13 +437,13 @@
}
/* special version of get_block that is only used by grab_tail_page right
-** now. It is sent to block_prepare_write, and when you try to get a
+** now. It is sent to __block_write_begin, and when you try to get a
** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer. block_prepare_write expects to
+** -ENOENT instead of a valid buffer. __block_write_begin expects to
** be able to do i/o on the buffers returned, unless an error value
** is also returned.
**
-** So, this allows block_prepare_write to be used for reading a single block
+** So, this allows __block_write_begin to be used for reading a single block
** in a page. Where it does not produce a valid page for holes, or past the
** end of the file. This turns out to be exactly what we need for reading
** tails for conversion.
@@ -558,11 +556,12 @@
**
** We must fix the tail page for writing because it might have buffers
** that are mapped, but have a block number of 0. This indicates tail
- ** data that has been read directly into the page, and block_prepare_write
- ** won't trigger a get_block in this case.
+ ** data that has been read directly into the page, and
+ ** __block_write_begin won't trigger a get_block in this case.
*/
fix_tail_page_for_writing(tail_page);
- retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+ retval = __reiserfs_write_begin(tail_page, tail_start,
+ tail_end - tail_start);
if (retval)
goto unlock;
@@ -2033,7 +2032,7 @@
/* start within the page of the last block in the file */
start = (offset / blocksize) * blocksize;
- error = block_prepare_write(page, start, offset,
+ error = __block_write_begin(page, start, offset - start,
reiserfs_get_block_create_0);
if (error)
goto unlock;
@@ -2628,8 +2627,7 @@
return ret;
}
-int reiserfs_prepare_write(struct file *f, struct page *page,
- unsigned from, unsigned to)
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
{
struct inode *inode = page->mapping->host;
int ret;
@@ -2650,7 +2648,7 @@
th->t_refcount++;
}
- ret = block_prepare_write(page, from, to, reiserfs_get_block);
+ ret = __block_write_begin(page, from, len, reiserfs_get_block);
if (ret && reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th = current->journal_info;
/* this gets a little ugly. If reiserfs_get_block returned an
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 5cbb81e..adf22b4 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -160,8 +160,6 @@
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
- unsigned from, unsigned to);
/*
** reiserfs_unpack
** Function try to convert tail from direct item into indirect.
@@ -200,7 +198,7 @@
}
/* we unpack by finding the page with the tail, and calling
- ** reiserfs_prepare_write on that page. This will force a
+ ** __reiserfs_write_begin on that page. This will force a
** reiserfs_get_block to unpack the tail for us.
*/
index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -210,7 +208,7 @@
if (!page) {
goto out;
}
- retval = reiserfs_prepare_write(NULL, page, write_from, write_from);
+ retval = __reiserfs_write_begin(page, write_from, 0);
if (retval)
goto out_unlock;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ee78d4a..ba5f51e 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1156,7 +1156,7 @@
inode->i_ctime = CURRENT_TIME_SEC;
reiserfs_update_sd(&th, inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
d_instantiate(dentry, inode);
retval = journal_end(&th, dir->i_sb, jbegin_count);
reiserfs_write_unlock(dir->i_sb);
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8c4cf27..5d04a78 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -418,13 +418,11 @@
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
-int reiserfs_prepare_write(struct file *f, struct page *page,
- unsigned from, unsigned to);
static void update_ctime(struct inode *inode)
{
struct timespec now = current_fs_time(inode->i_sb);
- if (hlist_unhashed(&inode->i_hash) || !inode->i_nlink ||
+ if (inode_unhashed(inode) || !inode->i_nlink ||
timespec_equal(&inode->i_ctime, &now))
return;
@@ -532,8 +530,7 @@
rxh->h_hash = cpu_to_le32(xahash);
}
- err = reiserfs_prepare_write(NULL, page, page_offset,
- page_offset + chunk + skip);
+ err = __reiserfs_write_begin(page, page_offset, chunk + skip);
if (!err) {
if (buffer)
memcpy(data + skip, buffer + buffer_pos, chunk);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 0e7cb13..05d6b0e 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -462,9 +462,7 @@
if (size) {
char *p;
- spin_lock(&dcache_lock);
p = __d_path(path, root, buf, size);
- spin_unlock(&dcache_lock);
res = PTR_ERR(p);
if (!IS_ERR(p)) {
char *end = mangle_path(buf, p, esc);
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 00a70ca..f678d42 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -406,21 +406,15 @@
smb_renew_times(struct dentry * dentry)
{
dget(dentry);
- spin_lock(&dentry->d_lock);
- for (;;) {
- struct dentry *parent;
+ dentry->d_time = jiffies;
- dentry->d_time = jiffies;
- if (IS_ROOT(dentry))
- break;
- parent = dentry->d_parent;
- dget(parent);
- spin_unlock(&dentry->d_lock);
+ while (!IS_ROOT(dentry)) {
+ struct dentry *parent = dget_parent(dentry);
dput(dentry);
dentry = parent;
- spin_lock(&dentry->d_lock);
+
+ dentry->d_time = jiffies;
}
- spin_unlock(&dentry->d_lock);
dput(dentry);
}
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 8fc5e50..f6e9ee5 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -229,7 +229,6 @@
{
VERBOSE("\n");
shrink_dcache_sb(SB_of(server));
- invalidate_inodes(SB_of(server));
}
/*
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 71c29b6..3dcf638 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -332,16 +332,15 @@
* and store it in reversed order [see reverse_string()]
*/
dget(entry);
- spin_lock(&entry->d_lock);
while (!IS_ROOT(entry)) {
struct dentry *parent;
if (maxlen < (3<<unicode)) {
- spin_unlock(&entry->d_lock);
dput(entry);
return -ENAMETOOLONG;
}
+ spin_lock(&entry->d_lock);
len = server->ops->convert(path, maxlen-2,
entry->d_name.name, entry->d_name.len,
server->local_nls, server->remote_nls);
@@ -359,15 +358,12 @@
}
*path++ = '\\';
maxlen -= len+1;
-
- parent = entry->d_parent;
- dget(parent);
spin_unlock(&entry->d_lock);
+
+ parent = dget_parent(entry);
dput(entry);
entry = parent;
- spin_lock(&entry->d_lock);
}
- spin_unlock(&entry->d_lock);
dput(entry);
reverse_string(buf, path-buf);
diff --git a/fs/super.c b/fs/super.c
index 8819e3a..b9c9869 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -273,14 +273,14 @@
get_fs_excl();
sb->s_flags &= ~MS_ACTIVE;
- /* bad name - it should be evict_inodes() */
- invalidate_inodes(sb);
+ fsnotify_unmount_inodes(&sb->s_inodes);
+
+ evict_inodes(sb);
if (sop->put_super)
sop->put_super(sb);
- /* Forget any remaining inodes */
- if (invalidate_inodes(sb)) {
+ if (!list_empty(&sb->s_inodes)) {
printk("VFS: Busy inodes after unmount of %s. "
"Self-destruct in 5 seconds. Have a nice day...\n",
sb->s_id);
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 33e047b..11e7f7d 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -126,7 +126,7 @@
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
return add_nondir(dentry, inode);
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 87ebcce..14f64b6 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -550,7 +550,7 @@
lock_2_inodes(dir, inode);
inc_nlink(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
inode->i_ctime = ubifs_current_time(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index bf5fc67..6d8dc02 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1101,7 +1101,7 @@
inc_nlink(inode);
inode->i_ctime = current_fs_time(inode->i_sb);
mark_inode_dirty(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
d_instantiate(dentry, inode);
unlock_kernel();
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index b056f02..12f39b9 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -180,7 +180,7 @@
inode->i_ctime = CURRENT_TIME_SEC;
inode_inc_link_count(inode);
- atomic_inc(&inode->i_count);
+ ihold(inode);
error = ufs_add_nondir(dentry, inode);
unlock_kernel();
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ba53128..63fd2c0 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1580,6 +1580,7 @@
XFS_BUFTARG_NAME(btp));
return ENOMEM;
}
+ inode->i_ino = get_next_ino();
inode->i_mode = S_IFBLK;
inode->i_bdev = bdev;
inode->i_rdev = bdev->bd_dev;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index ec858e0..96107ef 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -317,7 +317,7 @@
if (unlikely(error))
return -error;
- atomic_inc(&inode->i_count);
+ ihold(inode);
d_instantiate(dentry, inode);
return 0;
}
@@ -760,7 +760,9 @@
inode->i_ino = ip->i_ino;
inode->i_state = I_NEW;
- inode_add_to_lists(ip->i_mount->m_super, inode);
+
+ inode_sb_list_add(inode);
+ insert_inode_hash(inode);
inode->i_mode = ip->i_d.di_mode;
inode->i_nlink = ip->i_d.di_nlink;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index ab31ce5..cf80878 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -576,7 +576,7 @@
/* Figure out maximum filesize, on Linux this can depend on
* the filesystem blocksize (on 32 bit platforms).
- * __block_prepare_write does this in an [unsigned] long...
+ * __block_write_begin does this in an [unsigned] long...
* page->index << (PAGE_CACHE_SHIFT - bbits)
* So, for page sized blocks (4K on 32 bit platforms),
* this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index fac5229..fb2ca2e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -500,7 +500,7 @@
#define IHOLD(ip) \
do { \
ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
- atomic_inc(&(VFS_I(ip)->i_count)); \
+ ihold(VFS_I(ip)); \
trace_xfs_ihold(ip, _THIS_IP_); \
} while (0)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index dd1b25b..68d1fe7 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -212,7 +212,6 @@
loff_t, unsigned, unsigned,
struct page *, void *);
void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
-int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
int cont_write_begin(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page **, void **,
get_block_t *, loff_t *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4658777..240eb1d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -92,6 +92,9 @@
/* Expect random access pattern */
#define FMODE_RANDOM ((__force fmode_t)0x1000)
+/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
+#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
@@ -722,7 +725,8 @@
struct inode {
struct hlist_node i_hash;
- struct list_head i_list; /* backing dev IO list */
+ struct list_head i_wb_list; /* backing dev IO list */
+ struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
struct list_head i_dentry;
unsigned long i_ino;
@@ -789,6 +793,11 @@
void *i_private; /* fs or device private pointer */
};
+static inline int inode_unhashed(struct inode *inode)
+{
+ return hlist_unhashed(&inode->i_hash);
+}
+
/*
* inode->i_mutex nesting subclasses for the lock validator:
*
@@ -1639,16 +1648,17 @@
*
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*/
-#define I_DIRTY_SYNC 1
-#define I_DIRTY_DATASYNC 2
-#define I_DIRTY_PAGES 4
+#define I_DIRTY_SYNC (1 << 0)
+#define I_DIRTY_DATASYNC (1 << 1)
+#define I_DIRTY_PAGES (1 << 2)
#define __I_NEW 3
#define I_NEW (1 << __I_NEW)
-#define I_WILL_FREE 16
-#define I_FREEING 32
-#define I_CLEAR 64
+#define I_WILL_FREE (1 << 4)
+#define I_FREEING (1 << 5)
+#define I_CLEAR (1 << 6)
#define __I_SYNC 7
#define I_SYNC (1 << __I_SYNC)
+#define I_REFERENCED (1 << 8)
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
@@ -1740,6 +1750,7 @@
}
int sync_inode(struct inode *inode, struct writeback_control *wbc);
+int sync_inode_metadata(struct inode *inode, int wait);
struct file_system_type {
const char *name;
@@ -2084,7 +2095,6 @@
extern int __invalidate_device(struct block_device *);
extern int invalidate_partition(struct gendisk *, int);
#endif
-extern int invalidate_inodes(struct super_block *);
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end);
@@ -2168,7 +2178,7 @@
extern int inode_init_always(struct super_block *, struct inode *);
extern void inode_init_once(struct inode *);
-extern void inode_add_to_lists(struct super_block *, struct inode *);
+extern void ihold(struct inode * inode);
extern void iput(struct inode *);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
@@ -2188,11 +2198,11 @@
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
extern void unlock_new_inode(struct inode *);
+extern unsigned int get_next_ino(void);
extern void __iget(struct inode * inode);
extern void iget_failed(struct inode *);
extern void end_writeback(struct inode *);
-extern void destroy_inode(struct inode *);
extern void __destroy_inode(struct inode *);
extern struct inode *new_inode(struct super_block *);
extern int should_remove_suid(struct dentry *);
@@ -2200,9 +2210,11 @@
extern void __insert_inode_hash(struct inode *, unsigned long hashval);
extern void remove_inode_hash(struct inode *);
-static inline void insert_inode_hash(struct inode *inode) {
+static inline void insert_inode_hash(struct inode *inode)
+{
__insert_inode_hash(inode, inode->i_ino);
}
+extern void inode_sb_list_add(struct inode *inode);
#ifdef CONFIG_BLOCK
extern void submit_bio(int, struct bio *);
@@ -2485,7 +2497,10 @@
struct ctl_table;
int proc_nr_files(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
-
+int proc_nr_dentry(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+int proc_nr_inodes(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
int __init get_filesystem_list(char *buf);
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
diff --git a/include/linux/list.h b/include/linux/list.h
index 88a0006..9a5f8a7 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -636,6 +636,12 @@
next->next->pprev = &next->next;
}
+/* after that we'll appear to be on some hlist and hlist_del will work */
+static inline void hlist_add_fake(struct hlist_node *n)
+{
+ n->pprev = &n->next;
+}
+
/*
* Move a list from one list head to another. Fixup the pprev
* reference of the first entry if it exists.
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 91a4177..5ca47e5 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -2072,6 +2072,8 @@
void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs);
int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
+int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
+
/* namei.c */
void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index c7299d2..d5c7aaa 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -10,8 +10,6 @@
struct backing_dev_info;
extern spinlock_t inode_lock;
-extern struct list_head inode_in_use;
-extern struct list_head inode_unused;
/*
* fs/fs-writeback.c
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index e1e7b96..3a61ffe 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -116,6 +116,7 @@
inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
@@ -769,7 +770,7 @@
inode = dentry->d_inode;
if (inode)
- atomic_inc(&inode->i_count);
+ ihold(inode);
err = mnt_want_write(ipc_ns->mq_mnt);
if (err)
goto out_err;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7b69b8d..9270d53 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -777,6 +777,7 @@
struct inode *inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
diff --git a/kernel/futex.c b/kernel/futex.c
index a118bf1..6c683b3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -169,7 +169,7 @@
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- atomic_inc(&key->shared.inode->i_count);
+ ihold(key->shared.inode);
break;
case FUT_OFF_MMSHARED:
atomic_inc(&key->private.mm->mm_count);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 48d9d68..c33a1ed 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1338,14 +1338,14 @@
.data = &inodes_stat,
.maxlen = 2*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_inodes,
},
{
.procname = "inode-state",
.data = &inodes_stat,
.maxlen = 7*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_inodes,
},
{
.procname = "file-nr",
@@ -1375,7 +1375,7 @@
.data = &dentry_stat,
.maxlen = 6*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_dentry,
},
{
.procname = "overflowuid",
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f2eb278..027100d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -74,11 +74,11 @@
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
- list_for_each_entry(inode, &wb->b_dirty, i_list)
+ list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
+ list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
+ list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
spin_unlock(&inode_lock);
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a..f6d350e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1586,6 +1586,7 @@
inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1903,7 +1904,7 @@
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
inc_nlink(inode);
- atomic_inc(&inode->i_count); /* New dentry reference */
+ ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
out:
@@ -2146,7 +2147,7 @@
if (*len < 3)
return 255;
- if (hlist_unhashed(&inode->i_hash)) {
+ if (inode_unhashed(inode)) {
/* Unfortunately insert_inode_hash is not idempotent,
* so as we hash inodes here rather than at creation
* time, we need a lock to ensure we only try
@@ -2154,7 +2155,7 @@
*/
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
__insert_inode_hash(inode,
inode->i_ino + inode->i_generation);
spin_unlock(&lock);
diff --git a/net/socket.c b/net/socket.c
index 7f67c07..ee3cd28 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -377,7 +377,7 @@
&socket_file_ops);
if (unlikely(!file)) {
/* drop dentry, keep inode */
- atomic_inc(&path.dentry->d_inode->i_count);
+ ihold(path.dentry->d_inode);
path_put(&path);
put_unused_fd(fd);
return -ENFILE;
@@ -480,6 +480,7 @@
sock = SOCKET_I(inode);
kmemcheck_annotate_bitfield(sock, type);
+ inode->i_ino = get_next_ino();
inode->i_mode = S_IFSOCK | S_IRWXUGO;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 52f2524..7df92d2 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -445,6 +445,7 @@
struct inode *inode = new_inode(sb);
if (!inode)
return NULL;
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch(mode & S_IFMT) {
diff --git a/security/apparmor/path.c b/security/apparmor/path.c
index 8239605..36cc0cc 100644
--- a/security/apparmor/path.c
+++ b/security/apparmor/path.c
@@ -72,10 +72,8 @@
path_get(&root);
}
- spin_lock(&dcache_lock);
tmp = root;
res = __d_path(path, &tmp, buf, buflen);
- spin_unlock(&dcache_lock);
*name = res;
/* handle error conditions - and still allow a partial path to
diff --git a/security/inode.c b/security/inode.c
index 8883986..cb8f47c 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -61,6 +61,7 @@
struct inode *inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 87e0556..55a755c 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -978,6 +978,7 @@
struct inode *ret = new_inode(sb);
if (ret) {
+ ret->i_ino = get_next_ino();
ret->i_mode = mode;
ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
}
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index ed8ccd6..1d0bf8f 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -127,10 +127,8 @@
/* If we don't have a vfsmount, we can't calculate. */
if (!path->mnt)
break;
- spin_lock(&dcache_lock);
/* go to whatever namespace root we are under */
pos = __d_path(path, &ns_root, buf, buf_len);
- spin_unlock(&dcache_lock);
/* Prepend "/proc" prefix if using internal proc vfs mount. */
if (!IS_ERR(pos) && (path->mnt->mnt_flags & MNT_INTERNAL) &&
(path->mnt->mnt_sb->s_magic == PROC_SUPER_MAGIC)) {