Blame - fs/eventpoll.c - kernel/msm-4.9

blob: 88a6f8d0b88e2bda091e687a3213bd3107714b84 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* fs/eventpoll.c ( Efficient event polling implementation )
Davide Libenzi	3419b23	2006-06-25 05:48:14 -0700	[diff] [blame]	3	* Copyright (C) 2001,...,2006 Davide Libenzi
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	4	*
				5	* This program is free software; you can redistribute it and/or modify
				6	* it under the terms of the GNU General Public License as published by
				7	* the Free Software Foundation; either version 2 of the License, or
				8	* (at your option) any later version.
				9	*
				10	* Davide Libenzi <davidel@xmailserver.org>
				11	*
				12	*/
				13
				14	#include <linux/module.h>
				15	#include <linux/init.h>
				16	#include <linux/kernel.h>
				17	#include <linux/sched.h>
				18	#include <linux/fs.h>
				19	#include <linux/file.h>
				20	#include <linux/signal.h>
				21	#include <linux/errno.h>
				22	#include <linux/mm.h>
				23	#include <linux/slab.h>
				24	#include <linux/poll.h>
				25	#include <linux/smp_lock.h>
				26	#include <linux/string.h>
				27	#include <linux/list.h>
				28	#include <linux/hash.h>
				29	#include <linux/spinlock.h>
				30	#include <linux/syscalls.h>
				31	#include <linux/rwsem.h>
				32	#include <linux/rbtree.h>
				33	#include <linux/wait.h>
				34	#include <linux/eventpoll.h>
				35	#include <linux/mount.h>
				36	#include <linux/bitops.h>
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	37	#include <linux/mutex.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	38	#include <asm/uaccess.h>
				39	#include <asm/system.h>
				40	#include <asm/io.h>
				41	#include <asm/mman.h>
				42	#include <asm/atomic.h>
				43	#include <asm/semaphore.h>
				44
				45
				46	/*
				47	* LOCKING:
				48	* There are three levels of locking required by epoll :
				49	*
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	50	* 1) epmutex (mutex)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	51	* 2) ep->sem (rw_semaphore)
				52	* 3) ep->lock (rw_lock)
				53	*
				54	* The acquire order is the one listed above, from 1 to 3.
				55	* We need a spinlock (ep->lock) because we manipulate objects
				56	* from inside the poll callback, that might be triggered from
				57	* a wake_up() that in turn might be called from IRQ context.
				58	* So we can't sleep inside the poll callback and hence we need
				59	* a spinlock. During the event transfer loop (from kernel to
				60	* user space) we could end up sleeping due a copy_to_user(), so
				61	* we need a lock that will allow us to sleep. This lock is a
				62	* read-write semaphore (ep->sem). It is acquired on read during
				63	* the event transfer loop and in write during epoll_ctl(EPOLL_CTL_DEL)
				64	* and during eventpoll_release_file(). Then we also need a global
				65	* semaphore to serialize eventpoll_release_file() and ep_free().
				66	* This semaphore is acquired by ep_free() during the epoll file
				67	* cleanup path and it is also acquired by eventpoll_release_file()
				68	* if a file has been pushed inside an epoll set and it is then
				69	* close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
				70	* It is possible to drop the "ep->sem" and to use the global
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	71	* semaphore "epmutex" (together with "ep->lock") to have it working,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	72	* but having "ep->sem" will make the interface more scalable.
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	73	* Events that require holding "epmutex" are very rare, while for
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	74	* normal operations the epoll private "ep->sem" will guarantee
				75	* a greater scalability.
				76	*/
				77
				78
				79	#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */
				80
				81	#define DEBUG_EPOLL 0
				82
				83	#if DEBUG_EPOLL > 0
				84	#define DPRINTK(x) printk x
				85	#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
				86	#else /* #if DEBUG_EPOLL > 0 */
				87	#define DPRINTK(x) (void) 0
				88	#define DNPRINTK(n, x) (void) 0
				89	#endif /* #if DEBUG_EPOLL > 0 */
				90
				91	#define DEBUG_EPI 0
				92
				93	#if DEBUG_EPI != 0
				94	#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE \| SLAB_RED_ZONE /* \| SLAB_POISON */)
				95	#else /* #if DEBUG_EPI != 0 */
				96	#define EPI_SLAB_DEBUG 0
				97	#endif /* #if DEBUG_EPI != 0 */
				98
				99	/* Epoll private bits inside the event mask */
				100	#define EP_PRIVATE_BITS (EPOLLONESHOT \| EPOLLET)
				101
				102	/* Maximum number of poll wake up nests we are allowing */
				103	#define EP_MAX_POLLWAKE_NESTS 4
				104
Davide Libenzi	e3306dd	2005-09-27 21:45:33 -0700	[diff] [blame]	105	/* Maximum msec timeout value storeable in a long int */
				106	#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
				107
Davide Libenzi	b611967	2006-10-11 01:21:44 -0700	[diff] [blame]	108	#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
				109
Davide Libenzi	e3306dd	2005-09-27 21:45:33 -0700	[diff] [blame]	110
/*
 * (file, fd) pair identifying a monitored descriptor. It is filled in by
 * ep_set_ffd() and ordered by ep_cmp_ffd() when used as the rb-tree key.
 */
struct epoll_filefd {
	struct file *file;	/* the monitored file */
	int fd;			/* the descriptor number it was registered under */
};
				115
				116	/*
				117	* Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
				118	* It is used to keep track on all tasks that are currently inside the wake_up() code
				119	* to 1) short-circuit the one coming from the same task and same wait queue head
				120	* ( loop ) 2) allow a maximum number of epoll descriptors inclusion nesting
				121	* 3) let go the ones coming from other tasks.
				122	*/
				123	struct wake_task_node {
				124	struct list_head llink;
Ingo Molnar	36c8b58	2006-07-03 00:25:41 -0700	[diff] [blame]	125	struct task_struct *task;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	126	wait_queue_head_t *wq;
				127	};
				128
				129	/*
				130	* This is used to implement the safe poll wake up avoiding to reenter
				131	* the poll callback from inside wake_up().
				132	*/
				133	struct poll_safewake {
				134	struct list_head wake_task_list;
				135	spinlock_t lock;
				136	};
				137
				138	/*
				139	* This structure is stored inside the "private_data" member of the file
				140	* structure and rapresent the main data sructure for the eventpoll
				141	* interface.
				142	*/
				143	struct eventpoll {
				144	/* Protect the this structure access */
				145	rwlock_t lock;
				146
				147	/*
				148	* This semaphore is used to ensure that files are not removed
				149	* while epoll is using them. This is read-held during the event
				150	* collection loop and it is write-held during the file cleanup
				151	* path, the epoll file exit code and the ctl operations.
				152	*/
				153	struct rw_semaphore sem;
				154
				155	/* Wait queue used by sys_epoll_wait() */
				156	wait_queue_head_t wq;
				157
				158	/* Wait queue used by file->poll() */
				159	wait_queue_head_t poll_wait;
				160
				161	/* List of ready file descriptors */
				162	struct list_head rdllist;
				163
				164	/* RB-Tree root used to store monitored fd structs */
				165	struct rb_root rbr;
				166	};
				167
				168	/* Wait structure used by the poll hooks */
				169	struct eppoll_entry {
				170	/* List header used to link this structure to the "struct epitem" */
				171	struct list_head llink;
				172
				173	/* The "base" pointer is set to the container "struct epitem" */
				174	void *base;
				175
				176	/*
				177	* Wait queue item that will be linked to the target file wait
				178	* queue head.
				179	*/
				180	wait_queue_t wait;
				181
				182	/* The wait queue head that linked the "wait" wait queue item */
				183	wait_queue_head_t *whead;
				184	};
				185
				186	/*
				187	* Each file descriptor added to the eventpoll interface will
				188	* have an entry of this type linked to the hash.
				189	*/
				190	struct epitem {
				191	/* RB-Tree node used to link this structure to the eventpoll rb-tree */
				192	struct rb_node rbn;
				193
				194	/* List header used to link this structure to the eventpoll ready list */
				195	struct list_head rdllink;
				196
				197	/* The file descriptor information this item refers to */
				198	struct epoll_filefd ffd;
				199
				200	/* Number of active wait queue attached to poll operations */
				201	int nwait;
				202
				203	/* List containing poll wait queues */
				204	struct list_head pwqlist;
				205
				206	/* The "container" of this item */
				207	struct eventpoll *ep;
				208
				209	/* The structure that describe the interested events and the source fd */
				210	struct epoll_event event;
				211
				212	/*
				213	* Used to keep track of the usage count of the structure. This avoids
				214	* that the structure will desappear from underneath our processing.
				215	*/
				216	atomic_t usecnt;
				217
				218	/* List header used to link this item to the "struct file" items list */
				219	struct list_head fllink;
				220
				221	/* List header used to link the item to the transfer list */
				222	struct list_head txlink;
				223
				224	/*
				225	* This is used during the collection/transfer of events to userspace
				226	* to pin items empty events set.
				227	*/
				228	unsigned int revents;
				229	};
				230
				231	/* Wrapper struct used by poll queueing */
				232	struct ep_pqueue {
				233	poll_table pt;
				234	struct epitem *epi;
				235	};
				236
				237
				238
				239	static void ep_poll_safewake_init(struct poll_safewake *psw);
				240	static void ep_poll_safewake(struct poll_safewake psw, wait_queue_head_t wq);
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	241	static int ep_getfd(int efd, struct inode einode, struct file *efile,
				242	struct eventpoll *ep);
				243	static int ep_alloc(struct eventpoll **pep);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	244	static void ep_free(struct eventpoll *ep);
				245	static struct epitem ep_find(struct eventpoll ep, struct file *file, int fd);
				246	static void ep_use_epitem(struct epitem *epi);
				247	static void ep_release_epitem(struct epitem *epi);
				248	static void ep_ptable_queue_proc(struct file file, wait_queue_head_t whead,
				249	poll_table *pt);
				250	static void ep_rbtree_insert(struct eventpoll ep, struct epitem epi);
				251	static int ep_insert(struct eventpoll ep, struct epoll_event event,
				252	struct file *tfile, int fd);
				253	static int ep_modify(struct eventpoll ep, struct epitem epi,
				254	struct epoll_event *event);
				255	static void ep_unregister_pollwait(struct eventpoll ep, struct epitem epi);
				256	static int ep_unlink(struct eventpoll ep, struct epitem epi);
				257	static int ep_remove(struct eventpoll ep, struct epitem epi);
				258	static int ep_poll_callback(wait_queue_t wait, unsigned mode, int sync, void key);
				259	static int ep_eventpoll_close(struct inode inode, struct file file);
				260	static unsigned int ep_eventpoll_poll(struct file file, poll_table wait);
				261	static int ep_collect_ready_items(struct eventpoll *ep,
				262	struct list_head *txlist, int maxevents);
				263	static int ep_send_events(struct eventpoll ep, struct list_head txlist,
				264	struct epoll_event __user *events);
				265	static void ep_reinject_items(struct eventpoll ep, struct list_head txlist);
				266	static int ep_events_transfer(struct eventpoll *ep,
				267	struct epoll_event __user *events,
				268	int maxevents);
				269	static int ep_poll(struct eventpoll ep, struct epoll_event __user events,
				270	int maxevents, long timeout);
				271	static int eventpollfs_delete_dentry(struct dentry *dentry);
				272	static struct inode *ep_eventpoll_inode(void);
David Howells	454e239	2006-06-23 02:02:57 -0700	[diff] [blame]	273	static int eventpollfs_get_sb(struct file_system_type *fs_type,
				274	int flags, const char *dev_name,
				275	void data, struct vfsmount mnt);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	276
				277	/*
				278	* This semaphore is used to serialize ep_free() and eventpoll_release_file().
				279	*/
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	280	static struct mutex epmutex;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	281
				282	/* Safe wake up implementation */
				283	static struct poll_safewake psw;
				284
				285	/* Slab cache used to allocate "struct epitem" */
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	286	static struct kmem_cache *epi_cache __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	287
				288	/* Slab cache used to allocate "struct eppoll_entry" */
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	289	static struct kmem_cache *pwq_cache __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	290
				291	/* Virtual fs used to allocate inodes for eventpoll files */
Eric Dumazet	fa3536c	2006-03-26 01:37:24 -0800	[diff] [blame]	292	static struct vfsmount *eventpoll_mnt __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	293
				294	/* File callbacks that implement the eventpoll file behaviour */
Arjan van de Ven	4b6f5d2	2006-03-28 01:56:42 -0800	[diff] [blame]	295	static const struct file_operations eventpoll_fops = {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	296	.release = ep_eventpoll_close,
				297	.poll = ep_eventpoll_poll
				298	};
				299
				300	/*
				301	* This is used to register the virtual file system from where
				302	* eventpoll inodes are allocated.
				303	*/
				304	static struct file_system_type eventpoll_fs_type = {
				305	.name = "eventpollfs",
				306	.get_sb = eventpollfs_get_sb,
				307	.kill_sb = kill_anon_super,
				308	};
				309
				310	/* Very basic directory entry operations for the eventpoll virtual file system */
				311	static struct dentry_operations eventpollfs_dentry_operations = {
				312	.d_delete = eventpollfs_delete_dentry,
				313	};
				314
				315
				316
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	317	/* Fast test to see if the file is an evenpoll file */
				318	static inline int is_file_epoll(struct file *f)
				319	{
				320	return f->f_op == &eventpoll_fops;
				321	}
				322
				323	/* Setup the structure that is used as key for the rb-tree */
				324	static inline void ep_set_ffd(struct epoll_filefd *ffd,
				325	struct file *file, int fd)
				326	{
				327	ffd->file = file;
				328	ffd->fd = fd;
				329	}
				330
				331	/* Compare rb-tree keys */
				332	static inline int ep_cmp_ffd(struct epoll_filefd *p1,
				333	struct epoll_filefd *p2)
				334	{
				335	return (p1->file > p2->file ? +1:
				336	(p1->file < p2->file ? -1 : p1->fd - p2->fd));
				337	}
				338
				339	/* Special initialization for the rb-tree node to detect linkage */
				340	static inline void ep_rb_initnode(struct rb_node *n)
				341	{
David Woodhouse	c569882	2006-04-21 13:17:24 +0100	[diff] [blame]	342	rb_set_parent(n, n);
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	343	}
				344
				345	/* Removes a node from the rb-tree and marks it for a fast is-linked check */
				346	static inline void ep_rb_erase(struct rb_node n, struct rb_root r)
				347	{
				348	rb_erase(n, r);
David Woodhouse	c569882	2006-04-21 13:17:24 +0100	[diff] [blame]	349	rb_set_parent(n, n);
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	350	}
				351
				352	/* Fast check to verify that the item is linked to the main rb-tree */
				353	static inline int ep_rb_linked(struct rb_node *n)
				354	{
David Woodhouse	c569882	2006-04-21 13:17:24 +0100	[diff] [blame]	355	return rb_parent(n) != n;
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	356	}
				357
				358	/*
				359	* Remove the item from the list and perform its initialization.
				360	* This is useful for us because we can test if the item is linked
				361	* using "ep_is_linked(p)".
				362	*/
				363	static inline void ep_list_del(struct list_head *p)
				364	{
				365	list_del(p);
				366	INIT_LIST_HEAD(p);
				367	}
				368
				369	/* Tells us if the item is currently linked */
				370	static inline int ep_is_linked(struct list_head *p)
				371	{
				372	return !list_empty(p);
				373	}
				374
				375	/* Get the "struct epitem" from a wait queue pointer */
				376	static inline struct epitem * ep_item_from_wait(wait_queue_t *p)
				377	{
				378	return container_of(p, struct eppoll_entry, wait)->base;
				379	}
				380
				381	/* Get the "struct epitem" from an epoll queue wrapper */
				382	static inline struct epitem * ep_item_from_epqueue(poll_table *p)
				383	{
				384	return container_of(p, struct ep_pqueue, pt)->epi;
				385	}
				386
				387	/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
				388	static inline int ep_op_hash_event(int op)
				389	{
				390	return op != EPOLL_CTL_DEL;
				391	}
				392
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	393	/* Initialize the poll safe wake up structure */
				394	static void ep_poll_safewake_init(struct poll_safewake *psw)
				395	{
				396
				397	INIT_LIST_HEAD(&psw->wake_task_list);
				398	spin_lock_init(&psw->lock);
				399	}
				400
				401
				402	/*
				403	* Perform a safe wake up of the poll wait list. The problem is that
				404	* with the new callback'd wake up system, it is possible that the
				405	* poll callback is reentered from inside the call to wake_up() done
				406	* on the poll wait queue head. The rule is that we cannot reenter the
				407	* wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
				408	* and we cannot reenter the same wait queue head at all. This will
				409	* enable to have a hierarchy of epoll file descriptor of no more than
				410	* EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock
				411	* because this one gets called by the poll callback, that in turn is called
				412	* from inside a wake_up(), that might be called from irq context.
				413	*/
				414	static void ep_poll_safewake(struct poll_safewake psw, wait_queue_head_t wq)
				415	{
				416	int wake_nests = 0;
				417	unsigned long flags;
Ingo Molnar	36c8b58	2006-07-03 00:25:41 -0700	[diff] [blame]	418	struct task_struct *this_task = current;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	419	struct list_head lsthead = &psw->wake_task_list, lnk;
				420	struct wake_task_node *tncur;
				421	struct wake_task_node tnode;
				422
				423	spin_lock_irqsave(&psw->lock, flags);
				424
				425	/* Try to see if the current task is already inside this wakeup call */
				426	list_for_each(lnk, lsthead) {
				427	tncur = list_entry(lnk, struct wake_task_node, llink);
				428
				429	if (tncur->wq == wq \|\|
				430	(tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
				431	/*
				432	* Ops ... loop detected or maximum nest level reached.
				433	* We abort this wake by breaking the cycle itself.
				434	*/
				435	spin_unlock_irqrestore(&psw->lock, flags);
				436	return;
				437	}
				438	}
				439
				440	/* Add the current task to the list */
				441	tnode.task = this_task;
				442	tnode.wq = wq;
				443	list_add(&tnode.llink, lsthead);
				444
				445	spin_unlock_irqrestore(&psw->lock, flags);
				446
				447	/* Do really wake up now */
				448	wake_up(wq);
				449
				450	/* Remove the current task from the list */
				451	spin_lock_irqsave(&psw->lock, flags);
				452	list_del(&tnode.llink);
				453	spin_unlock_irqrestore(&psw->lock, flags);
				454	}
				455
				456
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	457	/*
				458	* This is called from eventpoll_release() to unlink files from the eventpoll
				459	* interface. We need to have this facility to cleanup correctly files that are
				460	* closed without being removed from the eventpoll interface.
				461	*/
				462	void eventpoll_release_file(struct file *file)
				463	{
				464	struct list_head *lsthead = &file->f_ep_links;
				465	struct eventpoll *ep;
				466	struct epitem *epi;
				467
				468	/*
				469	* We don't want to get "file->f_ep_lock" because it is not
				470	* necessary. It is not necessary because we're in the "struct file"
				471	* cleanup path, and this means that noone is using this file anymore.
				472	* The only hit might come from ep_free() but by holding the semaphore
				473	* will correctly serialize the operation. We do need to acquire
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	474	* "ep->sem" after "epmutex" because ep_remove() requires it when called
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	475	* from anywhere but ep_free().
				476	*/
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	477	mutex_lock(&epmutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	478
				479	while (!list_empty(lsthead)) {
				480	epi = list_entry(lsthead->next, struct epitem, fllink);
				481
				482	ep = epi->ep;
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	483	ep_list_del(&epi->fllink);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	484	down_write(&ep->sem);
				485	ep_remove(ep, epi);
				486	up_write(&ep->sem);
				487	}
				488
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	489	mutex_unlock(&epmutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	490	}
				491
				492
				493	/*
				494	* It opens an eventpoll file descriptor by suggesting a storage of "size"
				495	* file descriptors. The size parameter is just an hint about how to size
				496	* data structures. It won't prevent the user to store more than "size"
				497	* file descriptors inside the epoll interface. It is the kernel part of
				498	* the userspace epoll_create(2).
				499	*/
				500	asmlinkage long sys_epoll_create(int size)
				501	{
Davide Libenzi	b611967	2006-10-11 01:21:44 -0700	[diff] [blame]	502	int error, fd = -1;
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	503	struct eventpoll *ep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	504	struct inode *inode;
				505	struct file *file;
				506
				507	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
				508	current, size));
				509
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	510	/*
				511	* Sanity check on the size parameter, and create the internal data
				512	* structure ( "struct eventpoll" ).
				513	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	514	error = -EINVAL;
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	515	if (size <= 0 \|\| (error = ep_alloc(&ep)) != 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	516	goto eexit_1;
				517
				518	/*
				519	* Creates all the items needed to setup an eventpoll file. That is,
				520	* a file structure, and inode and a free file descriptor.
				521	*/
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	522	error = ep_getfd(&fd, &inode, &file, ep);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	523	if (error)
				524	goto eexit_2;
				525
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	526	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
				527	current, size, fd));
				528
				529	return fd;
				530
				531	eexit_2:
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	532	ep_free(ep);
				533	kfree(ep);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	534	eexit_1:
				535	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
				536	current, size, error));
				537	return error;
				538	}
				539
				540
				541	/*
				542	* The following function implements the controller interface for
				543	* the eventpoll file that enables the insertion/removal/change of
				544	* file descriptors inside the interest set. It represents
				545	* the kernel part of the user space epoll_ctl(2).
				546	*/
				547	asmlinkage long
				548	sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
				549	{
				550	int error;
				551	struct file file, tfile;
				552	struct eventpoll *ep;
				553	struct epitem *epi;
				554	struct epoll_event epds;
				555
				556	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
				557	current, epfd, op, fd, event));
				558
				559	error = -EFAULT;
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	560	if (ep_op_hash_event(op) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	561	copy_from_user(&epds, event, sizeof(struct epoll_event)))
				562	goto eexit_1;
				563
				564	/* Get the "struct file " for the eventpoll file /
				565	error = -EBADF;
				566	file = fget(epfd);
				567	if (!file)
				568	goto eexit_1;
				569
				570	/* Get the "struct file " for the target file /
				571	tfile = fget(fd);
				572	if (!tfile)
				573	goto eexit_2;
				574
				575	/* The target file descriptor must support poll */
				576	error = -EPERM;
				577	if (!tfile->f_op \|\| !tfile->f_op->poll)
				578	goto eexit_3;
				579
				580	/*
				581	* We have to check that the file structure underneath the file descriptor
				582	* the user passed to us _is_ an eventpoll file. And also we do not permit
				583	* adding an epoll file descriptor inside itself.
				584	*/
				585	error = -EINVAL;
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	586	if (file == tfile \|\| !is_file_epoll(file))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	587	goto eexit_3;
				588
				589	/*
				590	* At this point it is safe to assume that the "private_data" contains
				591	* our own data structure.
				592	*/
				593	ep = file->private_data;
				594
				595	down_write(&ep->sem);
				596
				597	/* Try to lookup the file inside our hash table */
				598	epi = ep_find(ep, tfile, fd);
				599
				600	error = -EINVAL;
				601	switch (op) {
				602	case EPOLL_CTL_ADD:
				603	if (!epi) {
Davide Libenzi	2395140	2006-04-10 22:54:12 -0700	[diff] [blame]	604	epds.events \|= POLLERR \| POLLHUP;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	605
				606	error = ep_insert(ep, &epds, tfile, fd);
				607	} else
				608	error = -EEXIST;
				609	break;
				610	case EPOLL_CTL_DEL:
				611	if (epi)
				612	error = ep_remove(ep, epi);
				613	else
				614	error = -ENOENT;
				615	break;
				616	case EPOLL_CTL_MOD:
				617	if (epi) {
Davide Libenzi	2395140	2006-04-10 22:54:12 -0700	[diff] [blame]	618	epds.events \|= POLLERR \| POLLHUP;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	619	error = ep_modify(ep, epi, &epds);
				620	} else
				621	error = -ENOENT;
				622	break;
				623	}
				624
				625	/*
				626	* The function ep_find() increments the usage count of the structure
				627	* so, if this is not NULL, we need to release it.
				628	*/
				629	if (epi)
				630	ep_release_epitem(epi);
				631
				632	up_write(&ep->sem);
				633
				634	eexit_3:
				635	fput(tfile);
				636	eexit_2:
				637	fput(file);
				638	eexit_1:
				639	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
				640	current, epfd, op, fd, event, error));
				641
				642	return error;
				643	}
				644
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	645
				646	/*
				647	* Implement the event wait interface for the eventpoll file. It is the kernel
				648	* part of the user space epoll_wait(2).
				649	*/
				650	asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
				651	int maxevents, int timeout)
				652	{
				653	int error;
				654	struct file *file;
				655	struct eventpoll *ep;
				656
				657	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
				658	current, epfd, events, maxevents, timeout));
				659
				660	/* The maximum number of event must be greater than zero */
Davide Libenzi	b611967	2006-10-11 01:21:44 -0700	[diff] [blame]	661	if (maxevents <= 0 \|\| maxevents > EP_MAX_EVENTS)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	662	return -EINVAL;
				663
				664	/* Verify that the area passed by the user is writeable */
				665	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
				666	error = -EFAULT;
				667	goto eexit_1;
				668	}
				669
				670	/* Get the "struct file " for the eventpoll file /
				671	error = -EBADF;
				672	file = fget(epfd);
				673	if (!file)
				674	goto eexit_1;
				675
				676	/*
				677	* We have to check that the file structure underneath the fd
				678	* the user passed to us _is_ an eventpoll file.
				679	*/
				680	error = -EINVAL;
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	681	if (!is_file_epoll(file))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	682	goto eexit_2;
				683
				684	/*
				685	* At this point it is safe to assume that the "private_data" contains
				686	* our own data structure.
				687	*/
				688	ep = file->private_data;
				689
				690	/* Time to fish for events ... */
				691	error = ep_poll(ep, events, maxevents, timeout);
				692
				693	eexit_2:
				694	fput(file);
				695	eexit_1:
				696	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
				697	current, epfd, events, maxevents, timeout, error));
				698
				699	return error;
				700	}
				701
				702
Davide Libenzi	b611967	2006-10-11 01:21:44 -0700	[diff] [blame]	703	#ifdef TIF_RESTORE_SIGMASK
				704
				705	/*
				706	* Implement the event wait interface for the eventpoll file. It is the kernel
				707	* part of the user space epoll_pwait(2).
				708	*/
				709	asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
				710	int maxevents, int timeout, const sigset_t __user *sigmask,
				711	size_t sigsetsize)
				712	{
				713	int error;
				714	sigset_t ksigmask, sigsaved;
				715
				716	/*
				717	* If the caller wants a certain signal mask to be set during the wait,
				718	* we apply it here.
				719	*/
				720	if (sigmask) {
				721	if (sigsetsize != sizeof(sigset_t))
				722	return -EINVAL;
				723	if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
				724	return -EFAULT;
				725	sigdelsetmask(&ksigmask, sigmask(SIGKILL) \| sigmask(SIGSTOP));
				726	sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
				727	}
				728
				729	error = sys_epoll_wait(epfd, events, maxevents, timeout);
				730
				731	/*
				732	* If we changed the signal mask, we need to restore the original one.
				733	* In case we've got a signal while waiting, we do not restore the
				734	* signal mask yet, and we allow do_signal() to deliver the signal on
				735	* the way back to userspace, before the signal mask is restored.
				736	*/
				737	if (sigmask) {
				738	if (error == -EINTR) {
				739	memcpy(&current->saved_sigmask, &sigsaved,
				740	sizeof(sigsaved));
				741	set_thread_flag(TIF_RESTORE_SIGMASK);
				742	} else
				743	sigprocmask(SIG_SETMASK, &sigsaved, NULL);
				744	}
				745
				746	return error;
				747	}
				748
				749	#endif /* #ifdef TIF_RESTORE_SIGMASK */
				750
				751
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	752	/*
				753	* Creates the file descriptor to be used by the epoll interface.
				754	*/
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	755	static int ep_getfd(int efd, struct inode einode, struct file *efile,
				756	struct eventpoll *ep)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	757	{
				758	struct qstr this;
				759	char name[32];
				760	struct dentry *dentry;
				761	struct inode *inode;
				762	struct file *file;
				763	int error, fd;
				764
				765	/* Get an ready to use file */
				766	error = -ENFILE;
				767	file = get_empty_filp();
				768	if (!file)
				769	goto eexit_1;
				770
				771	/* Allocates an inode from the eventpoll file system */
				772	inode = ep_eventpoll_inode();
Jeff Garzik	c3b6571	2006-10-03 01:13:52 -0700	[diff] [blame]	773	if (IS_ERR(inode)) {
				774	error = PTR_ERR(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	775	goto eexit_2;
Jeff Garzik	c3b6571	2006-10-03 01:13:52 -0700	[diff] [blame]	776	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	777
				778	/* Allocates a free descriptor to plug the file onto */
				779	error = get_unused_fd();
				780	if (error < 0)
				781	goto eexit_3;
				782	fd = error;
				783
				784	/*
				785	* Link the inode to a directory entry by creating a unique name
				786	* using the inode number.
				787	*/
				788	error = -ENOMEM;
				789	sprintf(name, "[%lu]", inode->i_ino);
				790	this.name = name;
				791	this.len = strlen(name);
				792	this.hash = inode->i_ino;
				793	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
				794	if (!dentry)
				795	goto eexit_4;
				796	dentry->d_op = &eventpollfs_dentry_operations;
				797	d_add(dentry, inode);
				798	file->f_vfsmnt = mntget(eventpoll_mnt);
				799	file->f_dentry = dentry;
				800	file->f_mapping = inode->i_mapping;
				801
				802	file->f_pos = 0;
				803	file->f_flags = O_RDONLY;
				804	file->f_op = &eventpoll_fops;
				805	file->f_mode = FMODE_READ;
				806	file->f_version = 0;
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	807	file->private_data = ep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	808
				809	/* Install the new setup file into the allocated fd. */
				810	fd_install(fd, file);
				811
				812	*efd = fd;
				813	*einode = inode;
				814	*efile = file;
				815	return 0;
				816
				817	eexit_4:
				818	put_unused_fd(fd);
				819	eexit_3:
				820	iput(inode);
				821	eexit_2:
				822	put_filp(file);
				823	eexit_1:
				824	return error;
				825	}
				826
				827
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	828	static int ep_alloc(struct eventpoll **pep)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	829	{
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	830	struct eventpoll ep = kzalloc(sizeof(ep), GFP_KERNEL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	831
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	832	if (!ep)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	833	return -ENOMEM;
				834
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	835	rwlock_init(&ep->lock);
				836	init_rwsem(&ep->sem);
				837	init_waitqueue_head(&ep->wq);
				838	init_waitqueue_head(&ep->poll_wait);
				839	INIT_LIST_HEAD(&ep->rdllist);
				840	ep->rbr = RB_ROOT;
				841
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	842	*pep = ep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	843
Davide Libenzi	53d2be7	2005-09-16 19:28:06 -0700	[diff] [blame]	844	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	845	current, ep));
				846	return 0;
				847	}
				848
				849
				850	static void ep_free(struct eventpoll *ep)
				851	{
				852	struct rb_node *rbp;
				853	struct epitem *epi;
				854
				855	/* We need to release all tasks waiting for these file */
				856	if (waitqueue_active(&ep->poll_wait))
				857	ep_poll_safewake(&psw, &ep->poll_wait);
				858
				859	/*
				860	* We need to lock this because we could be hit by
				861	* eventpoll_release_file() while we're freeing the "struct eventpoll".
				862	* We do not need to hold "ep->sem" here because the epoll file
				863	* is on the way to be removed and no one has references to it
				864	* anymore. The only hit might come from eventpoll_release_file() but
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	865	* holding "epmutex" is sufficent here.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	866	*/
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	867	mutex_lock(&epmutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	868
				869	/*
				870	* Walks through the whole tree by unregistering poll callbacks.
				871	*/
				872	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
				873	epi = rb_entry(rbp, struct epitem, rbn);
				874
				875	ep_unregister_pollwait(ep, epi);
				876	}
				877
				878	/*
				879	* Walks through the whole hash by freeing each "struct epitem". At this
				880	* point we are sure no poll callbacks will be lingering around, and also by
				881	* write-holding "sem" we can be sure that no file cleanup code will hit
				882	* us during this operation. So we can avoid the lock on "ep->lock".
				883	*/
				884	while ((rbp = rb_first(&ep->rbr)) != 0) {
				885	epi = rb_entry(rbp, struct epitem, rbn);
				886	ep_remove(ep, epi);
				887	}
				888
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	889	mutex_unlock(&epmutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	890	}
				891
				892
				893	/*
				894	* Search the file inside the eventpoll hash. It add usage count to
				895	* the returned item, so the caller must call ep_release_epitem()
				896	* after finished using the "struct epitem".
				897	*/
				898	static struct epitem ep_find(struct eventpoll ep, struct file *file, int fd)
				899	{
				900	int kcmp;
				901	unsigned long flags;
				902	struct rb_node *rbp;
				903	struct epitem epi, epir = NULL;
				904	struct epoll_filefd ffd;
				905
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	906	ep_set_ffd(&ffd, file, fd);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	907	read_lock_irqsave(&ep->lock, flags);
				908	for (rbp = ep->rbr.rb_node; rbp; ) {
				909	epi = rb_entry(rbp, struct epitem, rbn);
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	910	kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	911	if (kcmp > 0)
				912	rbp = rbp->rb_right;
				913	else if (kcmp < 0)
				914	rbp = rbp->rb_left;
				915	else {
				916	ep_use_epitem(epi);
				917	epir = epi;
				918	break;
				919	}
				920	}
				921	read_unlock_irqrestore(&ep->lock, flags);
				922
				923	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
				924	current, file, epir));
				925
				926	return epir;
				927	}
				928
				929
				930	/*
				931	* Increment the usage count of the "struct epitem" making it sure
				932	* that the user will have a valid pointer to reference.
				933	*/
				934	static void ep_use_epitem(struct epitem *epi)
				935	{
				936
				937	atomic_inc(&epi->usecnt);
				938	}
				939
				940
				941	/*
				942	* Decrement ( release ) the usage count by signaling that the user
				943	* has finished using the structure. It might lead to freeing the
				944	* structure itself if the count goes to zero.
				945	*/
				946	static void ep_release_epitem(struct epitem *epi)
				947	{
				948
				949	if (atomic_dec_and_test(&epi->usecnt))
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	950	kmem_cache_free(epi_cache, epi);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	951	}
				952
				953
				954	/*
				955	* This is the callback that is used to add our wait queue to the
				956	* target file wakeup lists.
				957	*/
				958	static void ep_ptable_queue_proc(struct file file, wait_queue_head_t whead,
				959	poll_table *pt)
				960	{
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	961	struct epitem *epi = ep_item_from_epqueue(pt);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	962	struct eppoll_entry *pwq;
				963
Christoph Lameter	e94b176	2006-12-06 20:33:17 -0800	[diff] [blame]	964	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	965	init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
				966	pwq->whead = whead;
				967	pwq->base = epi;
				968	add_wait_queue(whead, &pwq->wait);
				969	list_add_tail(&pwq->llink, &epi->pwqlist);
				970	epi->nwait++;
				971	} else {
				972	/* We have to signal that an error occurred */
				973	epi->nwait = -1;
				974	}
				975	}
				976
				977
				978	static void ep_rbtree_insert(struct eventpoll ep, struct epitem epi)
				979	{
				980	int kcmp;
				981	struct rb_node *p = &ep->rbr.rb_node, parent = NULL;
				982	struct epitem *epic;
				983
				984	while (*p) {
				985	parent = *p;
				986	epic = rb_entry(parent, struct epitem, rbn);
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	987	kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	988	if (kcmp > 0)
				989	p = &parent->rb_right;
				990	else
				991	p = &parent->rb_left;
				992	}
				993	rb_link_node(&epi->rbn, parent, p);
				994	rb_insert_color(&epi->rbn, &ep->rbr);
				995	}
				996
				997
				998	static int ep_insert(struct eventpoll ep, struct epoll_event event,
				999	struct file *tfile, int fd)
				1000	{
				1001	int error, revents, pwake = 0;
				1002	unsigned long flags;
				1003	struct epitem *epi;
				1004	struct ep_pqueue epq;
				1005
				1006	error = -ENOMEM;
Christoph Lameter	e94b176	2006-12-06 20:33:17 -0800	[diff] [blame]	1007	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1008	goto eexit_1;
				1009
				1010	/* Item initialization follow here ... */
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1011	ep_rb_initnode(&epi->rbn);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1012	INIT_LIST_HEAD(&epi->rdllink);
				1013	INIT_LIST_HEAD(&epi->fllink);
				1014	INIT_LIST_HEAD(&epi->txlink);
				1015	INIT_LIST_HEAD(&epi->pwqlist);
				1016	epi->ep = ep;
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1017	ep_set_ffd(&epi->ffd, tfile, fd);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1018	epi->event = *event;
				1019	atomic_set(&epi->usecnt, 1);
				1020	epi->nwait = 0;
				1021
				1022	/* Initialize the poll table using the queue callback */
				1023	epq.epi = epi;
				1024	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
				1025
				1026	/*
				1027	* Attach the item to the poll hooks and get current event bits.
				1028	* We can safely use the file* here because its usage count has
				1029	* been increased by the caller of this function.
				1030	*/
				1031	revents = tfile->f_op->poll(tfile, &epq.pt);
				1032
				1033	/*
				1034	* We have to check if something went wrong during the poll wait queue
				1035	* install process. Namely an allocation for a wait queue failed due
				1036	* high memory pressure.
				1037	*/
				1038	if (epi->nwait < 0)
				1039	goto eexit_2;
				1040
				1041	/* Add the current item to the list of active epoll hook for this file */
				1042	spin_lock(&tfile->f_ep_lock);
				1043	list_add_tail(&epi->fllink, &tfile->f_ep_links);
				1044	spin_unlock(&tfile->f_ep_lock);
				1045
				1046	/* We have to drop the new item inside our item list to keep track of it */
				1047	write_lock_irqsave(&ep->lock, flags);
				1048
				1049	/* Add the current item to the rb-tree */
				1050	ep_rbtree_insert(ep, epi);
				1051
				1052	/* If the file is already "ready" we drop it inside the ready list */
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1053	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1054	list_add_tail(&epi->rdllink, &ep->rdllist);
				1055
				1056	/* Notify waiting tasks that events are available */
				1057	if (waitqueue_active(&ep->wq))
Davide Libenzi	3419b23	2006-06-25 05:48:14 -0700	[diff] [blame]	1058	__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE \| TASK_INTERRUPTIBLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1059	if (waitqueue_active(&ep->poll_wait))
				1060	pwake++;
				1061	}
				1062
				1063	write_unlock_irqrestore(&ep->lock, flags);
				1064
				1065	/* We have to call this outside the lock */
				1066	if (pwake)
				1067	ep_poll_safewake(&psw, &ep->poll_wait);
				1068
				1069	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
				1070	current, ep, tfile, fd));
				1071
				1072	return 0;
				1073
				1074	eexit_2:
				1075	ep_unregister_pollwait(ep, epi);
				1076
				1077	/*
				1078	* We need to do this because an event could have been arrived on some
				1079	* allocated wait queue.
				1080	*/
				1081	write_lock_irqsave(&ep->lock, flags);
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1082	if (ep_is_linked(&epi->rdllink))
				1083	ep_list_del(&epi->rdllink);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1084	write_unlock_irqrestore(&ep->lock, flags);
				1085
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1086	kmem_cache_free(epi_cache, epi);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1087	eexit_1:
				1088	return error;
				1089	}
				1090
				1091
				1092	/*
				1093	* Modify the interest event mask by dropping an event if the new mask
				1094	* has a match in the current file status.
				1095	*/
				1096	static int ep_modify(struct eventpoll ep, struct epitem epi, struct epoll_event *event)
				1097	{
				1098	int pwake = 0;
				1099	unsigned int revents;
				1100	unsigned long flags;
				1101
				1102	/*
				1103	* Set the new event interest mask before calling f_op->poll(), otherwise
				1104	* a potential race might occur. In fact if we do this operation inside
				1105	* the lock, an event might happen between the f_op->poll() call and the
				1106	* new event set registering.
				1107	*/
				1108	epi->event.events = event->events;
				1109
				1110	/*
				1111	* Get current event bits. We can safely use the file* here because
				1112	* its usage count has been increased by the caller of this function.
				1113	*/
				1114	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
				1115
				1116	write_lock_irqsave(&ep->lock, flags);
				1117
				1118	/* Copy the data member from inside the lock */
				1119	epi->event.data = event->data;
				1120
				1121	/*
				1122	* If the item is not linked to the hash it means that it's on its
				1123	* way toward the removal. Do nothing in this case.
				1124	*/
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1125	if (ep_rb_linked(&epi->rbn)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1126	/*
				1127	* If the item is "hot" and it is not registered inside the ready
				1128	* list, push it inside. If the item is not "hot" and it is currently
				1129	* registered inside the ready list, unlink it.
				1130	*/
				1131	if (revents & event->events) {
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1132	if (!ep_is_linked(&epi->rdllink)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1133	list_add_tail(&epi->rdllink, &ep->rdllist);
				1134
				1135	/* Notify waiting tasks that events are available */
				1136	if (waitqueue_active(&ep->wq))
Davide Libenzi	3419b23	2006-06-25 05:48:14 -0700	[diff] [blame]	1137	__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE \|
				1138	TASK_INTERRUPTIBLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1139	if (waitqueue_active(&ep->poll_wait))
				1140	pwake++;
				1141	}
				1142	}
				1143	}
				1144
				1145	write_unlock_irqrestore(&ep->lock, flags);
				1146
				1147	/* We have to call this outside the lock */
				1148	if (pwake)
				1149	ep_poll_safewake(&psw, &ep->poll_wait);
				1150
				1151	return 0;
				1152	}
				1153
				1154
				1155	/*
				1156	* This function unregister poll callbacks from the associated file descriptor.
				1157	* Since this must be called without holding "ep->lock" the atomic exchange trick
				1158	* will protect us from multiple unregister.
				1159	*/
				1160	static void ep_unregister_pollwait(struct eventpoll ep, struct epitem epi)
				1161	{
				1162	int nwait;
				1163	struct list_head *lsthead = &epi->pwqlist;
				1164	struct eppoll_entry *pwq;
				1165
				1166	/* This is called without locks, so we need the atomic exchange */
				1167	nwait = xchg(&epi->nwait, 0);
				1168
				1169	if (nwait) {
				1170	while (!list_empty(lsthead)) {
				1171	pwq = list_entry(lsthead->next, struct eppoll_entry, llink);
				1172
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1173	ep_list_del(&pwq->llink);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1174	remove_wait_queue(pwq->whead, &pwq->wait);
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1175	kmem_cache_free(pwq_cache, pwq);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1176	}
				1177	}
				1178	}
				1179
				1180
				1181	/*
				1182	* Unlink the "struct epitem" from all places it might have been hooked up.
				1183	* This function must be called with write IRQ lock on "ep->lock".
				1184	*/
				1185	static int ep_unlink(struct eventpoll ep, struct epitem epi)
				1186	{
				1187	int error;
				1188
				1189	/*
				1190	* It can happen that this one is called for an item already unlinked.
				1191	* The check protect us from doing a double unlink ( crash ).
				1192	*/
				1193	error = -ENOENT;
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1194	if (!ep_rb_linked(&epi->rbn))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1195	goto eexit_1;
				1196
				1197	/*
				1198	* Clear the event mask for the unlinked item. This will avoid item
				1199	* notifications to be sent after the unlink operation from inside
				1200	* the kernel->userspace event transfer loop.
				1201	*/
				1202	epi->event.events = 0;
				1203
				1204	/*
				1205	* At this point is safe to do the job, unlink the item from our rb-tree.
				1206	* This operation togheter with the above check closes the door to
				1207	* double unlinks.
				1208	*/
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1209	ep_rb_erase(&epi->rbn, &ep->rbr);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1210
				1211	/*
				1212	* If the item we are going to remove is inside the ready file descriptors
				1213	* we want to remove it from this list to avoid stale events.
				1214	*/
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1215	if (ep_is_linked(&epi->rdllink))
				1216	ep_list_del(&epi->rdllink);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1217
				1218	error = 0;
				1219	eexit_1:
				1220
				1221	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
Masoud Asgharifard Sharbiani	45f17e0	2006-08-27 01:23:48 -0700	[diff] [blame]	1222	current, ep, epi->ffd.file, error));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1223
				1224	return error;
				1225	}
				1226
				1227
				1228	/*
				1229	* Removes a "struct epitem" from the eventpoll hash and deallocates
				1230	* all the associated resources.
				1231	*/
				1232	static int ep_remove(struct eventpoll ep, struct epitem epi)
				1233	{
				1234	int error;
				1235	unsigned long flags;
				1236	struct file *file = epi->ffd.file;
				1237
				1238	/*
				1239	* Removes poll wait queue hooks. We _have_ to do this without holding
				1240	* the "ep->lock" otherwise a deadlock might occur. This because of the
				1241	* sequence of the lock acquisition. Here we do "ep->lock" then the wait
				1242	* queue head lock when unregistering the wait queue. The wakeup callback
				1243	* will run by holding the wait queue head lock and will call our callback
				1244	* that will try to get "ep->lock".
				1245	*/
				1246	ep_unregister_pollwait(ep, epi);
				1247
				1248	/* Remove the current item from the list of epoll hooks */
				1249	spin_lock(&file->f_ep_lock);
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1250	if (ep_is_linked(&epi->fllink))
				1251	ep_list_del(&epi->fllink);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1252	spin_unlock(&file->f_ep_lock);
				1253
				1254	/* We need to acquire the write IRQ lock before calling ep_unlink() */
				1255	write_lock_irqsave(&ep->lock, flags);
				1256
				1257	/* Really unlink the item from the hash */
				1258	error = ep_unlink(ep, epi);
				1259
				1260	write_unlock_irqrestore(&ep->lock, flags);
				1261
				1262	if (error)
				1263	goto eexit_1;
				1264
				1265	/* At this point it is safe to free the eventpoll item */
				1266	ep_release_epitem(epi);
				1267
				1268	error = 0;
				1269	eexit_1:
				1270	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
				1271	current, ep, file, error));
				1272
				1273	return error;
				1274	}
				1275
				1276
				1277	/*
				1278	* This is the callback that is passed to the wait queue wakeup
				1279	* machanism. It is called by the stored file descriptors when they
				1280	* have events to report.
				1281	*/
				1282	static int ep_poll_callback(wait_queue_t wait, unsigned mode, int sync, void key)
				1283	{
				1284	int pwake = 0;
				1285	unsigned long flags;
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1286	struct epitem *epi = ep_item_from_wait(wait);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1287	struct eventpoll *ep = epi->ep;
				1288
				1289	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
Masoud Asgharifard Sharbiani	45f17e0	2006-08-27 01:23:48 -0700	[diff] [blame]	1290	current, epi->ffd.file, epi, ep));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1291
				1292	write_lock_irqsave(&ep->lock, flags);
				1293
				1294	/*
				1295	* If the event mask does not contain any poll(2) event, we consider the
				1296	* descriptor to be disabled. This condition is likely the effect of the
				1297	* EPOLLONESHOT bit that disables the descriptor when an event is received,
				1298	* until the next EPOLL_CTL_MOD will be issued.
				1299	*/
				1300	if (!(epi->event.events & ~EP_PRIVATE_BITS))
				1301	goto is_disabled;
				1302
				1303	/* If this file is already in the ready list we exit soon */
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1304	if (ep_is_linked(&epi->rdllink))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1305	goto is_linked;
				1306
				1307	list_add_tail(&epi->rdllink, &ep->rdllist);
				1308
				1309	is_linked:
				1310	/*
				1311	* Wake up ( if active ) both the eventpoll wait list and the ->poll()
				1312	* wait list.
				1313	*/
				1314	if (waitqueue_active(&ep->wq))
Davide Libenzi	3419b23	2006-06-25 05:48:14 -0700	[diff] [blame]	1315	__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE \|
				1316	TASK_INTERRUPTIBLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1317	if (waitqueue_active(&ep->poll_wait))
				1318	pwake++;
				1319
				1320	is_disabled:
				1321	write_unlock_irqrestore(&ep->lock, flags);
				1322
				1323	/* We have to call this outside the lock */
				1324	if (pwake)
				1325	ep_poll_safewake(&psw, &ep->poll_wait);
				1326
				1327	return 1;
				1328	}
				1329
				1330
				1331	static int ep_eventpoll_close(struct inode inode, struct file file)
				1332	{
				1333	struct eventpoll *ep = file->private_data;
				1334
				1335	if (ep) {
				1336	ep_free(ep);
				1337	kfree(ep);
				1338	}
				1339
				1340	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
				1341	return 0;
				1342	}
				1343
				1344
				1345	static unsigned int ep_eventpoll_poll(struct file file, poll_table wait)
				1346	{
				1347	unsigned int pollflags = 0;
				1348	unsigned long flags;
				1349	struct eventpoll *ep = file->private_data;
				1350
				1351	/* Insert inside our poll wait queue */
				1352	poll_wait(file, &ep->poll_wait, wait);
				1353
				1354	/* Check our condition */
				1355	read_lock_irqsave(&ep->lock, flags);
				1356	if (!list_empty(&ep->rdllist))
				1357	pollflags = POLLIN \| POLLRDNORM;
				1358	read_unlock_irqrestore(&ep->lock, flags);
				1359
				1360	return pollflags;
				1361	}
				1362
				1363
				1364	/*
				1365	* Since we have to release the lock during the __copy_to_user() operation and
				1366	* during the f_op->poll() call, we try to collect the maximum number of items
				1367	* by reducing the irqlock/irqunlock switching rate.
				1368	*/
				1369	static int ep_collect_ready_items(struct eventpoll ep, struct list_head txlist, int maxevents)
				1370	{
				1371	int nepi;
				1372	unsigned long flags;
				1373	struct list_head lsthead = &ep->rdllist, lnk;
				1374	struct epitem *epi;
				1375
				1376	write_lock_irqsave(&ep->lock, flags);
				1377
				1378	for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
				1379	epi = list_entry(lnk, struct epitem, rdllink);
				1380
				1381	lnk = lnk->next;
				1382
				1383	/* If this file is already in the ready list we exit soon */
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1384	if (!ep_is_linked(&epi->txlink)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1385	/*
				1386	* This is initialized in this way so that the default
				1387	* behaviour of the reinjecting code will be to push back
				1388	* the item inside the ready list.
				1389	*/
				1390	epi->revents = epi->event.events;
				1391
				1392	/* Link the ready item into the transfer list */
				1393	list_add(&epi->txlink, txlist);
				1394	nepi++;
				1395
				1396	/*
				1397	* Unlink the item from the ready list.
				1398	*/
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1399	ep_list_del(&epi->rdllink);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1400	}
				1401	}
				1402
				1403	write_unlock_irqrestore(&ep->lock, flags);
				1404
				1405	return nepi;
				1406	}
				1407
				1408
				1409	/*
				1410	* This function is called without holding the "ep->lock" since the call to
				1411	* __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
				1412	* because of the way poll() is traditionally implemented in Linux.
				1413	*/
				1414	static int ep_send_events(struct eventpoll ep, struct list_head txlist,
				1415	struct epoll_event __user *events)
				1416	{
				1417	int eventcnt = 0;
				1418	unsigned int revents;
				1419	struct list_head *lnk;
				1420	struct epitem *epi;
				1421
				1422	/*
				1423	* We can loop without lock because this is a task private list.
				1424	* The test done during the collection loop will guarantee us that
				1425	* another task will not try to collect this file. Also, items
				1426	* cannot vanish during the loop because we are holding "sem".
				1427	*/
				1428	list_for_each(lnk, txlist) {
				1429	epi = list_entry(lnk, struct epitem, txlink);
				1430
				1431	/*
				1432	* Get the ready file event set. We can safely use the file
				1433	* because we are holding the "sem" in read and this will
				1434	* guarantee that both the file and the item will not vanish.
				1435	*/
				1436	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
				1437
				1438	/*
				1439	* Set the return event set for the current file descriptor.
				1440	* Note that only the task task was successfully able to link
				1441	* the item to its "txlist" will write this field.
				1442	*/
				1443	epi->revents = revents & epi->event.events;
				1444
				1445	if (epi->revents) {
				1446	if (__put_user(epi->revents,
				1447	&events[eventcnt].events) \|\|
				1448	__put_user(epi->event.data,
				1449	&events[eventcnt].data))
				1450	return -EFAULT;
				1451	if (epi->event.events & EPOLLONESHOT)
				1452	epi->event.events &= EP_PRIVATE_BITS;
				1453	eventcnt++;
				1454	}
				1455	}
				1456	return eventcnt;
				1457	}
				1458
				1459
				1460	/*
				1461	* Walk through the transfer list we collected with ep_collect_ready_items()
				1462	* and, if 1) the item is still "alive" 2) its event set is not empty 3) it's
				1463	* not already linked, links it to the ready list. Same as above, we are holding
				1464	* "sem" so items cannot vanish underneath our nose.
				1465	*/
				1466	static void ep_reinject_items(struct eventpoll ep, struct list_head txlist)
				1467	{
				1468	int ricnt = 0, pwake = 0;
				1469	unsigned long flags;
				1470	struct epitem *epi;
				1471
				1472	write_lock_irqsave(&ep->lock, flags);
				1473
				1474	while (!list_empty(txlist)) {
				1475	epi = list_entry(txlist->next, struct epitem, txlink);
				1476
				1477	/* Unlink the current item from the transfer list */
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1478	ep_list_del(&epi->txlink);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1479
				1480	/*
				1481	* If the item is no more linked to the interest set, we don't
				1482	* have to push it inside the ready list because the following
				1483	* ep_release_epitem() is going to drop it. Also, if the current
				1484	* item is set to have an Edge Triggered behaviour, we don't have
				1485	* to push it back either.
				1486	*/
Pekka Enberg	b030a4d	2005-06-23 00:10:03 -0700	[diff] [blame]	1487	if (ep_rb_linked(&epi->rbn) && !(epi->event.events & EPOLLET) &&
				1488	(epi->revents & epi->event.events) && !ep_is_linked(&epi->rdllink)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1489	list_add_tail(&epi->rdllink, &ep->rdllist);
				1490	ricnt++;
				1491	}
				1492	}
				1493
				1494	if (ricnt) {
				1495	/*
				1496	* Wake up ( if active ) both the eventpoll wait list and the ->poll()
				1497	* wait list.
				1498	*/
				1499	if (waitqueue_active(&ep->wq))
Davide Libenzi	3419b23	2006-06-25 05:48:14 -0700	[diff] [blame]	1500	__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE \|
				1501	TASK_INTERRUPTIBLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1502	if (waitqueue_active(&ep->poll_wait))
				1503	pwake++;
				1504	}
				1505
				1506	write_unlock_irqrestore(&ep->lock, flags);
				1507
				1508	/* We have to call this outside the lock */
				1509	if (pwake)
				1510	ep_poll_safewake(&psw, &ep->poll_wait);
				1511	}
				1512
				1513
				1514	/*
				1515	* Perform the transfer of events to user space.
				1516	*/
				1517	static int ep_events_transfer(struct eventpoll *ep,
				1518	struct epoll_event __user *events, int maxevents)
				1519	{
				1520	int eventcnt = 0;
				1521	struct list_head txlist;
				1522
				1523	INIT_LIST_HEAD(&txlist);
				1524
				1525	/*
				1526	* We need to lock this because we could be hit by
				1527	* eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
				1528	*/
				1529	down_read(&ep->sem);
				1530
				1531	/* Collect/extract ready items */
				1532	if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
				1533	/* Build result set in userspace */
				1534	eventcnt = ep_send_events(ep, &txlist, events);
				1535
				1536	/* Reinject ready items into the ready list */
				1537	ep_reinject_items(ep, &txlist);
				1538	}
				1539
				1540	up_read(&ep->sem);
				1541
				1542	return eventcnt;
				1543	}
				1544
				1545
				1546	static int ep_poll(struct eventpoll ep, struct epoll_event __user events,
				1547	int maxevents, long timeout)
				1548	{
				1549	int res, eavail;
				1550	unsigned long flags;
				1551	long jtimeout;
				1552	wait_queue_t wait;
				1553
				1554	/*
				1555	* Calculate the timeout by checking for the "infinite" value ( -1 )
				1556	* and the overflow condition. The passed timeout is in milliseconds,
				1557	* that why (t * HZ) / 1000.
				1558	*/
Davide Libenzi	e3306dd	2005-09-27 21:45:33 -0700	[diff] [blame]	1559	jtimeout = (timeout < 0 \|\| timeout >= EP_MAX_MSTIMEO) ?
				1560	MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1561
				1562	retry:
				1563	write_lock_irqsave(&ep->lock, flags);
				1564
				1565	res = 0;
				1566	if (list_empty(&ep->rdllist)) {
				1567	/*
				1568	* We don't have any available event to return to the caller.
				1569	* We need to sleep here, and we will be wake up by
				1570	* ep_poll_callback() when events will become available.
				1571	*/
				1572	init_waitqueue_entry(&wait, current);
Davide Libenzi	3419b23	2006-06-25 05:48:14 -0700	[diff] [blame]	1573	__add_wait_queue(&ep->wq, &wait);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1574
				1575	for (;;) {
				1576	/*
				1577	* We don't want to sleep if the ep_poll_callback() sends us
				1578	* a wakeup in between. That's why we set the task state
				1579	* to TASK_INTERRUPTIBLE before doing the checks.
				1580	*/
				1581	set_current_state(TASK_INTERRUPTIBLE);
				1582	if (!list_empty(&ep->rdllist) \|\| !jtimeout)
				1583	break;
				1584	if (signal_pending(current)) {
				1585	res = -EINTR;
				1586	break;
				1587	}
				1588
				1589	write_unlock_irqrestore(&ep->lock, flags);
				1590	jtimeout = schedule_timeout(jtimeout);
				1591	write_lock_irqsave(&ep->lock, flags);
				1592	}
Davide Libenzi	3419b23	2006-06-25 05:48:14 -0700	[diff] [blame]	1593	__remove_wait_queue(&ep->wq, &wait);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1594
				1595	set_current_state(TASK_RUNNING);
				1596	}
				1597
				1598	/* Is it worth to try to dig for events ? */
				1599	eavail = !list_empty(&ep->rdllist);
				1600
				1601	write_unlock_irqrestore(&ep->lock, flags);
				1602
				1603	/*
				1604	* Try to transfer events to user space. In case we get 0 events and
				1605	* there's still timeout left over, we go trying again in search of
				1606	* more luck.
				1607	*/
				1608	if (!res && eavail &&
				1609	!(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
				1610	goto retry;
				1611
				1612	return res;
				1613	}
				1614
				1615
				1616	static int eventpollfs_delete_dentry(struct dentry *dentry)
				1617	{
				1618
				1619	return 1;
				1620	}
				1621
				1622
				1623	static struct inode *ep_eventpoll_inode(void)
				1624	{
				1625	int error = -ENOMEM;
				1626	struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);
				1627
				1628	if (!inode)
				1629	goto eexit_1;
				1630
				1631	inode->i_fop = &eventpoll_fops;
				1632
				1633	/*
				1634	* Mark the inode dirty from the very beginning,
				1635	* that way it will never be moved to the dirty
				1636	* list because mark_inode_dirty() will think
				1637	* that it already _is_ on the dirty list.
				1638	*/
				1639	inode->i_state = I_DIRTY;
				1640	inode->i_mode = S_IRUSR \| S_IWUSR;
				1641	inode->i_uid = current->fsuid;
				1642	inode->i_gid = current->fsgid;
				1643	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1644	return inode;
				1645
				1646	eexit_1:
				1647	return ERR_PTR(error);
				1648	}
				1649
				1650
David Howells	454e239	2006-06-23 02:02:57 -0700	[diff] [blame]	1651	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1652	eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
David Howells	454e239	2006-06-23 02:02:57 -0700	[diff] [blame]	1653	const char dev_name, void data, struct vfsmount *mnt)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1654	{
David Howells	454e239	2006-06-23 02:02:57 -0700	[diff] [blame]	1655	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC,
				1656	mnt);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1657	}
				1658
				1659
				1660	static int __init eventpoll_init(void)
				1661	{
				1662	int error;
				1663
Arjan van de Ven	144efe3	2006-03-23 03:00:32 -0800	[diff] [blame]	1664	mutex_init(&epmutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1665
				1666	/* Initialize the structure used to perform safe poll wait head wake ups */
				1667	ep_poll_safewake_init(&psw);
				1668
				1669	/* Allocates slab cache used to allocate "struct epitem" items */
				1670	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
				1671	0, SLAB_HWCACHE_ALIGN\|EPI_SLAB_DEBUG\|SLAB_PANIC,
				1672	NULL, NULL);
				1673
				1674	/* Allocates slab cache used to allocate "struct eppoll_entry" */
				1675	pwq_cache = kmem_cache_create("eventpoll_pwq",
				1676	sizeof(struct eppoll_entry), 0,
				1677	EPI_SLAB_DEBUG\|SLAB_PANIC, NULL, NULL);
				1678
				1679	/*
				1680	* Register the virtual file system that will be the source of inodes
				1681	* for the eventpoll files
				1682	*/
				1683	error = register_filesystem(&eventpoll_fs_type);
				1684	if (error)
				1685	goto epanic;
				1686
				1687	/* Mount the above commented virtual file system */
				1688	eventpoll_mnt = kern_mount(&eventpoll_fs_type);
				1689	error = PTR_ERR(eventpoll_mnt);
				1690	if (IS_ERR(eventpoll_mnt))
				1691	goto epanic;
				1692
				1693	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
				1694	current));
				1695	return 0;
				1696
				1697	epanic:
				1698	panic("eventpoll_init() failed\n");
				1699	}
				1700
				1701
				1702	static void __exit eventpoll_exit(void)
				1703	{
				1704	/* Undo all operations done inside eventpoll_init() */
				1705	unregister_filesystem(&eventpoll_fs_type);
				1706	mntput(eventpoll_mnt);
				1707	kmem_cache_destroy(pwq_cache);
				1708	kmem_cache_destroy(epi_cache);
				1709	}
				1710
				1711	module_init(eventpoll_init);
				1712	module_exit(eventpoll_exit);
				1713
				1714	MODULE_LICENSE("GPL");