fsnotify: unified filesystem notification backend
fsnotify is a backend for filesystem notification. fsnotify does
not provide any userspace interface but does provide the basis
needed for other notification schemes such as dnotify. fsnotify
can be extended to be the backend for inotify or the upcoming
fanotify. fsnotify provides a mechanism for "groups" to register for
some set of filesystem events and to then deliver those events to
those groups for processing.
fsnotify has a number of benefits, the first being actually shrinking the size
of an inode. Before fsnotify to support both dnotify and inotify an inode had
unsigned long i_dnotify_mask; /* Directory notify events */
struct dnotify_struct *i_dnotify; /* for directory notifications */
struct list_head inotify_watches; /* watches on this inode */
struct mutex inotify_mutex; /* protects the watches list
But with fsnotify this same functionallity (and more) is done with just
__u32 i_fsnotify_mask; /* all events for this inode */
struct hlist_head i_fsnotify_mark_entries; /* marks on this inode */
That's right, inotify, dnotify, and fanotify all in 64 bits. We used that
much space just in inotify_watches alone, before this patch set.
fsnotify object lifetime and locking is MUCH better than what we have today.
inotify locking is incredibly complex. See 8f7b0ba1c8539 as an example of
what's been busted since inception. inotify needs to know internal semantics
of superblock destruction and unmounting to function. The inode pinning and
vfs contortions are horrible.
no fsnotify implementers do allocation under locks. This means things like
f04b30de3 which (due to an overabundance of caution) changes GFP_KERNEL to
GFP_NOFS can be reverted. There are no longer any allocation rules when using
or implementing your own fsnotify listener.
fsnotify paves the way for fanotify. In brief fanotify is a notification
mechanism that delivers the lisener both an 'event' and an open file descriptor
to the object in question. This means that fanotify is pathname agnostic.
Some on lkml may not care for the original companies or users that pushed for
TALPA, but fanotify was designed with flexibility and input for other users in
mind. The readahead group expressed interest in fanotify as it could be used
to profile disk access on boot without breaking the audit system. The desktop
search groups have also expressed interest in fanotify as it solves a number
of the race conditions and problems present with managing inotify when more
than a limited number of specific files are of interest. fanotify can provide
for a userspace access control system which makes it a clean interface for AV
vendors to hook without trying to do binary patching on the syscall table,
LSM, and everywhere else they do their things today. With this patch series
fanotify can be implemented in less than 1200 lines of easy to review code.
Almost all of which is the socket based user interface.
This patch series builds fsnotify to the point that it can implement
dnotify and inotify_user. Patches exist and will be sent soon after
acceptance to finish the in kernel inotify conversion (audit) and implement
fanotify.
Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 00fbd5b..6c9ebef 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -13,6 +13,7 @@
#include <linux/dnotify.h>
#include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
/*
@@ -35,6 +36,16 @@
}
/*
+ * fsnotify_link_count - inode's link count changed
+ */
+static inline void fsnotify_link_count(struct inode *inode)
+{
+ inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
+
+ fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE);
+}
+
+/*
* fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
*/
static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
@@ -43,28 +54,47 @@
{
struct inode *source = moved->d_inode;
u32 cookie = inotify_get_cookie();
+ __u32 old_dir_mask = 0;
+ __u32 new_dir_mask = 0;
- if (old_dir == new_dir)
+ if (old_dir == new_dir) {
inode_dir_notify(old_dir, DN_RENAME);
- else {
+ old_dir_mask = FS_DN_RENAME;
+ } else {
inode_dir_notify(old_dir, DN_DELETE);
+ old_dir_mask = FS_DELETE;
inode_dir_notify(new_dir, DN_CREATE);
+ new_dir_mask = FS_CREATE;
}
- if (isdir)
+ if (isdir) {
isdir = IN_ISDIR;
+ old_dir_mask |= FS_IN_ISDIR;
+ new_dir_mask |= FS_IN_ISDIR;
+ }
+
+ old_dir_mask |= FS_MOVED_FROM;
+ new_dir_mask |= FS_MOVED_TO;
+
inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name,
source);
inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name,
source);
+ fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE);
+ fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE);
+
if (target) {
inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
inotify_inode_is_dead(target);
+
+ /* this is really a link_count change not a removal */
+ fsnotify_link_count(target);
}
if (source) {
inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
+ fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE);
}
audit_inode_child(new_name, moved, new_dir);
}
@@ -74,10 +104,12 @@
*/
static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
{
+ __u32 mask = FS_DELETE;
+
if (isdir)
- isdir = IN_ISDIR;
+ mask |= FS_IN_ISDIR;
dnotify_parent(dentry, DN_DELETE);
- inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+ inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
}
/*
@@ -87,14 +119,8 @@
{
inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
inotify_inode_is_dead(inode);
-}
-/*
- * fsnotify_link_count - inode's link count changed
- */
-static inline void fsnotify_link_count(struct inode *inode)
-{
- inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
+ fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE);
}
/*
@@ -106,6 +132,8 @@
inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
dentry->d_inode);
audit_inode_child(dentry->d_name.name, dentry, inode);
+
+ fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE);
}
/*
@@ -120,6 +148,8 @@
inode);
fsnotify_link_count(inode);
audit_inode_child(new_dentry->d_name.name, new_dentry, dir);
+
+ fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE);
}
/*
@@ -127,10 +157,14 @@
*/
static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
{
+ __u32 mask = (FS_CREATE | FS_IN_ISDIR);
+ struct inode *d_inode = dentry->d_inode;
+
inode_dir_notify(inode, DN_CREATE);
- inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0,
- dentry->d_name.name, dentry->d_inode);
+ inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
audit_inode_child(dentry->d_name.name, dentry, inode);
+
+ fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE);
}
/*
@@ -139,14 +173,16 @@
static inline void fsnotify_access(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- u32 mask = IN_ACCESS;
+ __u32 mask = FS_ACCESS;
if (S_ISDIR(inode->i_mode))
- mask |= IN_ISDIR;
+ mask |= FS_IN_ISDIR;
dnotify_parent(dentry, DN_ACCESS);
inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+ fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
}
/*
@@ -155,14 +191,16 @@
static inline void fsnotify_modify(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- u32 mask = IN_MODIFY;
+ __u32 mask = FS_MODIFY;
if (S_ISDIR(inode->i_mode))
- mask |= IN_ISDIR;
+ mask |= FS_IN_ISDIR;
dnotify_parent(dentry, DN_MODIFY);
inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+ fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
}
/*
@@ -171,13 +209,15 @@
static inline void fsnotify_open(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- u32 mask = IN_OPEN;
+ __u32 mask = FS_OPEN;
if (S_ISDIR(inode->i_mode))
- mask |= IN_ISDIR;
+ mask |= FS_IN_ISDIR;
inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+ fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
}
/*
@@ -189,13 +229,15 @@
struct inode *inode = dentry->d_inode;
const char *name = dentry->d_name.name;
fmode_t mode = file->f_mode;
- u32 mask = (mode & FMODE_WRITE) ? IN_CLOSE_WRITE : IN_CLOSE_NOWRITE;
+ __u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
if (S_ISDIR(inode->i_mode))
- mask |= IN_ISDIR;
+ mask |= FS_IN_ISDIR;
inotify_dentry_parent_queue_event(dentry, mask, 0, name);
inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+ fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE);
}
/*
@@ -204,13 +246,15 @@
static inline void fsnotify_xattr(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
- u32 mask = IN_ATTRIB;
+ __u32 mask = FS_ATTRIB;
if (S_ISDIR(inode->i_mode))
- mask |= IN_ISDIR;
+ mask |= FS_IN_ISDIR;
inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+ fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
}
/*
@@ -221,34 +265,34 @@
{
struct inode *inode = dentry->d_inode;
int dn_mask = 0;
- u32 in_mask = 0;
+ __u32 in_mask = 0;
if (ia_valid & ATTR_UID) {
- in_mask |= IN_ATTRIB;
+ in_mask |= FS_ATTRIB;
dn_mask |= DN_ATTRIB;
}
if (ia_valid & ATTR_GID) {
- in_mask |= IN_ATTRIB;
+ in_mask |= FS_ATTRIB;
dn_mask |= DN_ATTRIB;
}
if (ia_valid & ATTR_SIZE) {
- in_mask |= IN_MODIFY;
+ in_mask |= FS_MODIFY;
dn_mask |= DN_MODIFY;
}
/* both times implies a utime(s) call */
if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME))
{
- in_mask |= IN_ATTRIB;
+ in_mask |= FS_ATTRIB;
dn_mask |= DN_ATTRIB;
} else if (ia_valid & ATTR_ATIME) {
- in_mask |= IN_ACCESS;
+ in_mask |= FS_ACCESS;
dn_mask |= DN_ACCESS;
} else if (ia_valid & ATTR_MTIME) {
- in_mask |= IN_MODIFY;
+ in_mask |= FS_MODIFY;
dn_mask |= DN_MODIFY;
}
if (ia_valid & ATTR_MODE) {
- in_mask |= IN_ATTRIB;
+ in_mask |= FS_ATTRIB;
dn_mask |= DN_ATTRIB;
}
@@ -256,14 +300,15 @@
dnotify_parent(dentry, dn_mask);
if (in_mask) {
if (S_ISDIR(inode->i_mode))
- in_mask |= IN_ISDIR;
+ in_mask |= FS_IN_ISDIR;
inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL);
inotify_dentry_parent_queue_event(dentry, in_mask, 0,
dentry->d_name.name);
+ fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE);
}
}
-#ifdef CONFIG_INOTIFY /* inotify helpers */
+#if defined(CONFIG_INOTIFY) || defined(CONFIG_FSNOTIFY) /* notify helpers */
/*
* fsnotify_oldname_init - save off the old filename before we change it
@@ -281,7 +326,7 @@
kfree(old_name);
}
-#else /* CONFIG_INOTIFY */
+#else /* CONFIG_INOTIFY || CONFIG_FSNOTIFY */
static inline const char *fsnotify_oldname_init(const char *name)
{
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
new file mode 100644
index 0000000..1a55718
--- /dev/null
+++ b/include/linux/fsnotify_backend.h
@@ -0,0 +1,177 @@
+/*
+ * Filesystem access notification for Linux
+ *
+ * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ */
+
+#ifndef __LINUX_FSNOTIFY_BACKEND_H
+#define __LINUX_FSNOTIFY_BACKEND_H
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h> /* struct inode */
+#include <linux/list.h>
+#include <linux/path.h> /* struct path */
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <asm/atomic.h>
+
+/*
+ * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
+ * convert between them. dnotify only needs conversion at watch creation
+ * so no perf loss there. fanotify isn't defined yet, so it can use the
+ * wholes if it needs more events.
+ */
+#define FS_ACCESS 0x00000001 /* File was accessed */
+#define FS_MODIFY 0x00000002 /* File was modified */
+#define FS_ATTRIB 0x00000004 /* Metadata changed */
+#define FS_CLOSE_WRITE 0x00000008 /* Writtable file was closed */
+#define FS_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */
+#define FS_OPEN 0x00000020 /* File was opened */
+#define FS_MOVED_FROM 0x00000040 /* File was moved from X */
+#define FS_MOVED_TO 0x00000080 /* File was moved to Y */
+#define FS_CREATE 0x00000100 /* Subfile was created */
+#define FS_DELETE 0x00000200 /* Subfile was deleted */
+#define FS_DELETE_SELF 0x00000400 /* Self was deleted */
+#define FS_MOVE_SELF 0x00000800 /* Self was moved */
+
+#define FS_UNMOUNT 0x00002000 /* inode on umount fs */
+#define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */
+#define FS_IN_IGNORED 0x00008000 /* last inotify event here */
+
+#define FS_IN_ISDIR 0x40000000 /* event occurred against dir */
+#define FS_IN_ONESHOT 0x80000000 /* only send event once */
+
+#define FS_DN_RENAME 0x10000000 /* file renamed */
+#define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */
+
+struct fsnotify_group;
+struct fsnotify_event;
+
+/*
+ * Each group much define these ops. The fsnotify infrastructure will call
+ * these operations for each relevant group.
+ *
+ * handle_event - main call for a group to handle an fs event
+ * free_group_priv - called when a group refcnt hits 0 to clean up the private union
+ */
+struct fsnotify_ops {
+ int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
+ void (*free_group_priv)(struct fsnotify_group *group);
+};
+
+/*
+ * A group is a "thing" that wants to receive notification about filesystem
+ * events. The mask holds the subset of event types this group cares about.
+ * refcnt on a group is up to the implementor and at any moment if it goes 0
+ * everything will be cleaned up.
+ */
+struct fsnotify_group {
+ /*
+ * global list of all groups receiving events from fsnotify.
+ * anchored by fsnotify_groups and protected by either fsnotify_grp_mutex
+ * or fsnotify_grp_srcu depending on write vs read.
+ */
+ struct list_head group_list;
+
+ /*
+ * Defines all of the event types in which this group is interested.
+ * This mask is a bitwise OR of the FS_* events from above. Each time
+ * this mask changes for a group (if it changes) the correct functions
+ * must be called to update the global structures which indicate global
+ * interest in event types.
+ */
+ __u32 mask;
+
+ /*
+ * How the refcnt is used is up to each group. When the refcnt hits 0
+ * fsnotify will clean up all of the resources associated with this group.
+ * As an example, the dnotify group will always have a refcnt=1 and that
+ * will never change. Inotify, on the other hand, has a group per
+ * inotify_init() and the refcnt will hit 0 only when that fd has been
+ * closed.
+ */
+ atomic_t refcnt; /* things with interest in this group */
+ unsigned int group_num; /* simply prevents accidental group collision */
+
+ const struct fsnotify_ops *ops; /* how this group handles things */
+
+ /* prevents double list_del of group_list. protected by global fsnotify_gr_mutex */
+ bool on_group_list;
+
+ /* groups can define private fields here or use the void *private */
+ union {
+ void *private;
+ };
+};
+
+/*
+ * all of the information about the original object we want to now send to
+ * a group. If you want to carry more info from the accessing task to the
+ * listener this structure is where you need to be adding fields.
+ */
+struct fsnotify_event {
+ spinlock_t lock; /* protection for the associated event_holder and private_list */
+ /* to_tell may ONLY be dereferenced during handle_event(). */
+ struct inode *to_tell; /* either the inode the event happened to or its parent */
+ /*
+ * depending on the event type we should have either a path or inode
+ * We hold a reference on path, but NOT on inode. Since we have the ref on
+ * the path, it may be dereferenced at any point during this object's
+ * lifetime. That reference is dropped when this object's refcnt hits
+ * 0. If this event contains an inode instead of a path, the inode may
+ * ONLY be used during handle_event().
+ */
+ union {
+ struct path path;
+ struct inode *inode;
+ };
+/* when calling fsnotify tell it if the data is a path or inode */
+#define FSNOTIFY_EVENT_NONE 0
+#define FSNOTIFY_EVENT_PATH 1
+#define FSNOTIFY_EVENT_INODE 2
+#define FSNOTIFY_EVENT_FILE 3
+ int data_type; /* which of the above union we have */
+ atomic_t refcnt; /* how many groups still are using/need to send this event */
+ __u32 mask; /* the type of access, bitwise OR for FS_* event types */
+};
+
+#ifdef CONFIG_FSNOTIFY
+
+/* called from the vfs helpers */
+
+/* main fsnotify call to send events */
+extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+
+
+/* called from fsnotify listeners, such as fanotify or dnotify */
+
+/* must call when a group changes its ->mask */
+extern void fsnotify_recalc_global_mask(void);
+/* get a reference to an existing or create a new group */
+extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num,
+ __u32 mask,
+ const struct fsnotify_ops *ops);
+/* drop reference on a group from fsnotify_obtain_group */
+extern void fsnotify_put_group(struct fsnotify_group *group);
+
+/* take a reference to an event */
+extern void fsnotify_get_event(struct fsnotify_event *event);
+extern void fsnotify_put_event(struct fsnotify_event *event);
+/* find private data previously attached to an event */
+extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
+ struct fsnotify_event *event);
+
+/* put here because inotify does some weird stuff when destroying watches */
+extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
+ void *data, int data_is);
+#else
+
+static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+{}
+#endif /* CONFIG_FSNOTIFY */
+
+#endif /* __KERNEL __ */
+
+#endif /* __LINUX_FSNOTIFY_BACKEND_H */