| Arnaldo Carvalho de Melo | 304a161 | 2005-08-09 19:59:20 -0700 | [diff] [blame] | 1 | /* | 
|  | 2 | * INET		An implementation of the TCP/IP protocol suite for the LINUX | 
|  | 3 | *		operating system.  INET is implemented using the BSD Socket | 
|  | 4 | *		interface as the means of communication with the user level. | 
|  | 5 | * | 
|  | 6 | * Authors:	Lotsa people, from code originally in tcp | 
|  | 7 | * | 
|  | 8 | *	This program is free software; you can redistribute it and/or | 
|  | 9 | *      modify it under the terms of the GNU General Public License | 
|  | 10 | *      as published by the Free Software Foundation; either version | 
|  | 11 | *      2 of the License, or (at your option) any later version. | 
|  | 12 | */ | 
|  | 13 |  | 
|  | 14 | #ifndef _INET_HASHTABLES_H | 
|  | 15 | #define _INET_HASHTABLES_H | 
|  | 16 |  | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame^] | 17 | #include <linux/interrupt.h> | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 18 | #include <linux/ip.h> | 
|  | 19 | #include <linux/list.h> | 
|  | 20 | #include <linux/slab.h> | 
|  | 21 | #include <linux/spinlock.h> | 
| Arnaldo Carvalho de Melo | 304a161 | 2005-08-09 19:59:20 -0700 | [diff] [blame] | 22 | #include <linux/types.h> | 
|  | 23 |  | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame^] | 24 | #include <net/sock.h> | 
|  | 25 |  | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 26 | /* This is for all connections with a full identity, no wildcards. | 
|  | 27 | * New scheme, half the table is for TIME_WAIT, the other half is | 
|  | 28 | * for the rest.  I'll experiment with dynamic table growth later. | 
|  | 29 | */ | 
|  | 30 | struct inet_ehash_bucket { | 
|  | 31 | rwlock_t	  lock; | 
|  | 32 | struct hlist_head chain; | 
|  | 33 | } __attribute__((__aligned__(8))); | 
|  | 34 |  | 
|  | 35 | /* There are a few simple rules, which allow for local port reuse by | 
|  | 36 | * an application.  In essence: | 
|  | 37 | * | 
|  | 38 | *	1) Sockets bound to different interfaces may share a local port. | 
|  | 39 | *	   Failing that, goto test 2. | 
|  | 40 | *	2) If all sockets have sk->sk_reuse set, and none of them are in | 
|  | 41 | *	   TCP_LISTEN state, the port may be shared. | 
|  | 42 | *	   Failing that, goto test 3. | 
|  | 43 | *	3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local | 
|  | 44 | *	   address, and none of them are the same, the port may be | 
|  | 45 | *	   shared. | 
|  | 46 | *	   Failing this, the port cannot be shared. | 
|  | 47 | * | 
|  | 48 | * The interesting point, is test #2.  This is what an FTP server does | 
|  | 49 | * all day.  To optimize this case we use a specific flag bit defined | 
|  | 50 | * below.  As we add sockets to a bind bucket list, we perform a | 
|  | 51 | * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) | 
|  | 52 | * As long as all sockets added to a bind bucket pass this test, | 
|  | 53 | * the flag bit will be set. | 
|  | 54 | * The resulting situation is that tcp_v[46]_verify_bind() can just check | 
|  | 55 | * for this flag bit, if it is set and the socket trying to bind has | 
|  | 56 | * sk->sk_reuse set, we don't even have to walk the owners list at all, | 
|  | 57 | * we return that it is ok to bind this socket to the requested local port. | 
|  | 58 | * | 
|  | 59 | * Sounds like a lot of work, but it is worth it.  In a more naive | 
|  | 60 | * implementation (ie. current FreeBSD etc.) the entire list of ports | 
|  | 61 | * must be walked for each data port opened by an ftp server.  Needless | 
|  | 62 | * to say, this does not scale at all.  With a couple thousand FTP | 
|  | 63 | * users logged onto your box, isn't it nice to know that new data | 
|  | 64 | * ports are created in O(1) time?  I thought so. ;-)	-DaveM | 
|  | 65 | */ | 
|  | 66 | struct inet_bind_bucket { | 
|  | 67 | unsigned short		port; | 
|  | 68 | signed short		fastreuse; | 
|  | 69 | struct hlist_node	node; | 
|  | 70 | struct hlist_head	owners; | 
|  | 71 | }; | 
|  | 72 |  | 
/*
 * Walk every struct inet_bind_bucket linked on a bind-hash chain.
 *
 *   tb:   struct inet_bind_bucket * that receives each entry in turn
 *   pos:  struct hlist_node * used as the raw list cursor
 *   head: the struct hlist_head to iterate
 *
 * The cursor parameter is deliberately NOT called "node".  The last argument
 * of hlist_for_each_entry() is the name of the linking member inside
 * struct inet_bind_bucket ("node"); a macro parameter with that same name
 * would be substituted into the member position as well, so the macro would
 * only compile for callers whose cursor variable happens to be named "node".
 */
#define inet_bind_bucket_for_each(tb, pos, head) \
	hlist_for_each_entry(tb, pos, head, node)
|  | 75 |  | 
|  | 76 | struct inet_bind_hashbucket { | 
|  | 77 | spinlock_t		lock; | 
|  | 78 | struct hlist_head	chain; | 
|  | 79 | }; | 
|  | 80 |  | 
/*
 * Bucket count of the listening-socket hash, i.e. the table for all sockets
 * which possess wildcards.  inet_lhashfn() masks with INET_LHTABLE_SIZE - 1,
 * so the value has to remain a power of two.
 */
#define INET_LHTABLE_SIZE	32	/* Yes, really, this is all you need. */
|  | 83 |  | 
|  | 84 | struct inet_hashinfo { | 
|  | 85 | /* This is for sockets with full identity only.  Sockets here will | 
|  | 86 | * always be without wildcards and will have the following invariant: | 
|  | 87 | * | 
|  | 88 | *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE | 
|  | 89 | * | 
|  | 90 | * First half of the table is for sockets not in TIME_WAIT, second half | 
|  | 91 | * is for TIME_WAIT sockets only. | 
|  | 92 | */ | 
|  | 93 | struct inet_ehash_bucket	*ehash; | 
|  | 94 |  | 
|  | 95 | /* Ok, let's try this, I give up, we do need a local binding | 
|  | 96 | * TCP hash as well as the others for fast bind/connect. | 
|  | 97 | */ | 
|  | 98 | struct inet_bind_hashbucket	*bhash; | 
|  | 99 |  | 
|  | 100 | int				bhash_size; | 
|  | 101 | int				ehash_size; | 
|  | 102 |  | 
|  | 103 | /* All sockets in TCP_LISTEN state will be in here.  This is the only | 
|  | 104 | * table where wildcard'd TCP sockets can exist.  Hash function here | 
|  | 105 | * is just local port number. | 
|  | 106 | */ | 
|  | 107 | struct hlist_head		listening_hash[INET_LHTABLE_SIZE]; | 
|  | 108 |  | 
|  | 109 | /* All the above members are written once at bootup and | 
|  | 110 | * never written again _or_ are predominantly read-access. | 
|  | 111 | * | 
|  | 112 | * Now align to a new cache line as all the following members | 
|  | 113 | * are often dirty. | 
|  | 114 | */ | 
|  | 115 | rwlock_t			lhash_lock ____cacheline_aligned; | 
|  | 116 | atomic_t			lhash_users; | 
|  | 117 | wait_queue_head_t		lhash_wait; | 
|  | 118 | spinlock_t			portalloc_lock; | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame^] | 119 | kmem_cache_t			*bind_bucket_cachep; | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 120 | }; | 
|  | 121 |  | 
/*
 * Map a connection 4-tuple onto a bucket index of the established hash.
 *
 *   laddr, lport: local address and port
 *   faddr, fport: foreign address and port
 *   ehash_size:   number of buckets; the result is masked with
 *                 ehash_size - 1, so a power of two is expected
 *
 * Returns an index in [0, ehash_size - 1].
 *
 * The fold is done on an unsigned 32-bit value.  With a signed int, storing
 * an address whose top bit is set and then right-shifting the negative
 * result are both implementation-defined, and an arithmetic shift drags
 * copies of the sign bit into the folded hash.
 */
static inline int inet_ehashfn(const __u32 laddr, const __u16 lport,
			       const __u32 faddr, const __u16 fport,
			       const int ehash_size)
{
	__u32 h = (laddr ^ lport) ^ (faddr ^ fport);

	h ^= h >> 16;
	h ^= h >> 8;
	return h & (ehash_size - 1);
}
|  | 131 |  | 
|  | 132 | static inline int inet_sk_ehashfn(const struct sock *sk, const int ehash_size) | 
|  | 133 | { | 
|  | 134 | const struct inet_sock *inet = inet_sk(sk); | 
|  | 135 | const __u32 laddr = inet->rcv_saddr; | 
|  | 136 | const __u16 lport = inet->num; | 
|  | 137 | const __u32 faddr = inet->daddr; | 
|  | 138 | const __u16 fport = inet->dport; | 
|  | 139 |  | 
|  | 140 | return inet_ehashfn(laddr, lport, faddr, fport, ehash_size); | 
|  | 141 | } | 
|  | 142 |  | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 143 | extern struct inet_bind_bucket * | 
|  | 144 | inet_bind_bucket_create(kmem_cache_t *cachep, | 
|  | 145 | struct inet_bind_hashbucket *head, | 
|  | 146 | const unsigned short snum); | 
|  | 147 | extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, | 
|  | 148 | struct inet_bind_bucket *tb); | 
|  | 149 |  | 
/*
 * Bind-hash slot for a local port: the port number masked with
 * bhash_size - 1.
 */
static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
{
	const int mask = bhash_size - 1;

	return lport & mask;
}
|  | 154 |  | 
/*
 * Defined out of line.
 * NOTE(review): presumably records sk as an owner of bind bucket tb for
 * local port snum - confirm against the definition.
 */
extern void inet_bind_hash(struct sock *sk,
			   struct inet_bind_bucket *tb,
			   const unsigned short snum);
|  | 157 |  | 
| Arnaldo Carvalho de Melo | 77d8bf9 | 2005-08-09 20:00:51 -0700 | [diff] [blame] | 158 | /* These can have wildcards, don't try too hard. */ | 
|  | 159 | static inline int inet_lhashfn(const unsigned short num) | 
|  | 160 | { | 
|  | 161 | return num & (INET_LHTABLE_SIZE - 1); | 
|  | 162 | } | 
|  | 163 |  | 
|  | 164 | static inline int inet_sk_listen_hashfn(const struct sock *sk) | 
|  | 165 | { | 
|  | 166 | return inet_lhashfn(inet_sk(sk)->num); | 
|  | 167 | } | 
|  | 168 |  | 
| Arnaldo Carvalho de Melo | 2d8c4ce | 2005-08-09 20:07:13 -0700 | [diff] [blame^] | 169 | /* Caller must disable local BH processing. */ | 
|  | 170 | static inline void __inet_inherit_port(struct inet_hashinfo *table, | 
|  | 171 | struct sock *sk, struct sock *child) | 
|  | 172 | { | 
|  | 173 | const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); | 
|  | 174 | struct inet_bind_hashbucket *head = &table->bhash[bhash]; | 
|  | 175 | struct inet_bind_bucket *tb; | 
|  | 176 |  | 
|  | 177 | spin_lock(&head->lock); | 
|  | 178 | tb = inet_sk(sk)->bind_hash; | 
|  | 179 | sk_add_bind_node(child, &tb->owners); | 
|  | 180 | inet_sk(child)->bind_hash = tb; | 
|  | 181 | spin_unlock(&head->lock); | 
|  | 182 | } | 
|  | 183 |  | 
/*
 * Wrapper around __inet_inherit_port() for callers that have not disabled
 * bottom halves themselves: local BH processing is switched off for the
 * duration of the call and switched back on afterwards.
 */
static inline void inet_inherit_port(struct inet_hashinfo *table,
				     struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__inet_inherit_port(table, sk, child);
	local_bh_enable();
}
|  | 191 |  | 
/*
 * Defined out of line.
 * NOTE(review): presumably releases sk's hold on its local port in table -
 * confirm against the definition.
 */
extern void inet_put_port(struct inet_hashinfo *table, struct sock *sk);
|  | 193 |  | 
| Arnaldo Carvalho de Melo | 304a161 | 2005-08-09 19:59:20 -0700 | [diff] [blame] | 194 | #endif /* _INET_HASHTABLES_H */ |