/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 * */

#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>

/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>

#define IP_CONNTRACK_VERSION	"2.4"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size __read_mostly = 0;
int ip_conntrack_max __read_mostly;
struct list_head *ip_conntrack_hash __read_mostly;
static struct kmem_cache *ip_conntrack_cachep __read_mostly;
static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid __read_mostly;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc __read_mostly;

static unsigned int ip_conntrack_next_id;
static unsigned int ip_conntrack_expect_next_id;
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);

DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);

/* deliver cached events and clear cache entry - must be called with locally
 * disabled softirqs */
static inline void
__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
{
	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
	if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
		atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
					   ecache->ct);
	ecache->events = 0;
	ip_conntrack_put(ecache->ct);
	ecache->ct = NULL;
}

/* Deliver all cached events for a particular conntrack. This is called
 * by code prior to async packet handling or freeing the skb */
void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
{
	struct ip_conntrack_ecache *ecache;

	local_bh_disable();
	ecache = &__get_cpu_var(ip_conntrack_ecache);
	if (ecache->ct == ct)
		__ip_ct_deliver_cached_events(ecache);
	local_bh_enable();
}

void __ip_ct_event_cache_init(struct ip_conntrack *ct)
{
	struct ip_conntrack_ecache *ecache;

	/* take care of delivering potentially old events */
	ecache = &__get_cpu_var(ip_conntrack_ecache);
	BUG_ON(ecache->ct == ct);
	if (ecache->ct)
		__ip_ct_deliver_cached_events(ecache);
	/* initialize for this conntrack/packet */
	ecache->ct = ct;
	nf_conntrack_get(&ct->ct_general);
}

/* flush the event cache - touches other CPU's data and must not be called while
 * packets are still passing through the code */
static void ip_ct_event_cache_flush(void)
{
	struct ip_conntrack_ecache *ecache;
	int cpu;

	for_each_possible_cpu(cpu) {
		ecache = &per_cpu(ip_conntrack_ecache, cpu);
		if (ecache->ct)
			ip_conntrack_put(ecache->ct);
	}
}
#else
static inline void ip_ct_event_cache_flush(void) {}
#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

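/* Hash a tuple into the conntrack table: jhash over the source address,
 * the destination address xor'd with the protocol number, and both ports,
 * mixed with the per-boot random seed and reduced modulo the table size. */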
static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
				  unsigned int size, unsigned int rnd)
{
	return (jhash_3words((__force u32)tuple->src.ip,
			     ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
			     (tuple->src.u.all | (tuple->dst.u.all << 16)),
			     rnd) % size);
}

static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
	return __hash_conntrack(tuple, ip_conntrack_htable_size,
				ip_conntrack_hash_rnd);
}

int
ip_ct_get_tuple(const struct iphdr *iph,
		const struct sk_buff *skb,
		unsigned int dataoff,
		struct ip_conntrack_tuple *tuple,
		const struct ip_conntrack_protocol *protocol)
{
	/* Never happen */
	if (iph->frag_off & htons(IP_OFFSET)) {
		printk("ip_conntrack_core: Frag of proto %u.\n",
		       iph->protocol);
		return 0;
	}

	tuple->src.ip = iph->saddr;
	tuple->dst.ip = iph->daddr;
	tuple->dst.protonum = iph->protocol;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig,
		   const struct ip_conntrack_protocol *protocol)
{
	inverse->src.ip = orig->dst.ip;
	inverse->dst.ip = orig->src.ip;
	inverse->dst.protonum = orig->dst.protonum;
	inverse->dst.dir = !orig->dst.dir;

	return protocol->invert_tuple(inverse, orig);
}


/* ip_conntrack_expect helper functions */
void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
	IP_NF_ASSERT(!timer_pending(&exp->timeout));
	list_del(&exp->list);
	CONNTRACK_STAT_INC(expect_delete);
	exp->master->expecting--;
	ip_conntrack_expect_put(exp);
}

static void expectation_timed_out(unsigned long ul_expect)
{
	struct ip_conntrack_expect *exp = (void *)ul_expect;

	write_lock_bh(&ip_conntrack_lock);
	ip_ct_unlink_expect(exp);
	write_unlock_bh(&ip_conntrack_lock);
	ip_conntrack_expect_put(exp);
}

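/* Find an expectation whose tuple matches, without taking a reference.
 * Caller must hold ip_conntrack_lock. */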
struct ip_conntrack_expect *
__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
			return i;
	}
	return NULL;
}

/* Just find an expectation corresponding to a tuple. */
struct ip_conntrack_expect *
ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	read_lock_bh(&ip_conntrack_lock);
	i = __ip_conntrack_expect_find(tuple);
	if (i)
		atomic_inc(&i->use);
	read_unlock_bh(&ip_conntrack_lock);

	return i;
}

/* If an expectation for this connection is found, it gets deleted from
 * the global list and is then returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
		    && is_confirmed(i->master)) {
			if (i->flags & IP_CT_EXPECT_PERMANENT) {
				atomic_inc(&i->use);
				return i;
			} else if (del_timer(&i->timeout)) {
				ip_ct_unlink_expect(i);
				return i;
			}
		}
	}
	return NULL;
}

/* delete all expectations for this conntrack */
void ip_ct_remove_expectations(struct ip_conntrack *ct)
{
	struct ip_conntrack_expect *i, *tmp;

	/* Optimization: most connections never expect any others. */
	if (ct->expecting == 0)
		return;

	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
		if (i->master == ct && del_timer(&i->timeout)) {
			ip_ct_unlink_expect(i);
			ip_conntrack_expect_put(i);
		}
	}
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
	DEBUGP("clean_from_lists(%p)\n", ct);
	list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);

	/* Destroy all pending expectations */
	ip_ct_remove_expectations(ct);
}

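/* Final destructor, invoked via nf_conntrack_put() once the last reference
 * to the conntrack has been dropped. */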
static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
	struct ip_conntrack_protocol *proto;
	struct ip_conntrack_helper *helper;

	DEBUGP("destroy_conntrack(%p)\n", ct);
	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
	IP_NF_ASSERT(!timer_pending(&ct->timeout));

	ip_conntrack_event(IPCT_DESTROY, ct);
	set_bit(IPS_DYING_BIT, &ct->status);

	helper = ct->helper;
	if (helper && helper->destroy)
		helper->destroy(ct);

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to ip_conntrack_lock!!! -HW */
	proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
	if (proto && proto->destroy)
		proto->destroy(ct);

	if (ip_conntrack_destroyed)
		ip_conntrack_destroyed(ct);

	write_lock_bh(&ip_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	ip_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!is_confirmed(ct)) {
		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	}

	CONNTRACK_STAT_INC(delete);
	write_unlock_bh(&ip_conntrack_lock);

	if (ct->master)
		ip_conntrack_put(ct->master);

	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
	ip_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct ip_conntrack *ct = (void *)ul_conntrack;

	write_lock_bh(&ip_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	CONNTRACK_STAT_INC(delete_list);
	clean_from_lists(ct);
	write_unlock_bh(&ip_conntrack_lock);
	ip_conntrack_put(ct);
}

struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int hash = hash_conntrack(tuple);

	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
		if (tuplehash_to_ctrack(h) != ignored_conntrack &&
		    ip_ct_tuple_equal(tuple, &h->tuple)) {
			CONNTRACK_STAT_INC(found);
			return h;
		}
		CONNTRACK_STAT_INC(searched);
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
		      const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	read_lock_bh(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	if (h)
		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
	read_unlock_bh(&ip_conntrack_lock);

	return h;
}

static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
				       unsigned int hash,
				       unsigned int repl_hash)
{
	ct->id = ++ip_conntrack_next_id;
	list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
		 &ip_conntrack_hash[hash]);
	list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
		 &ip_conntrack_hash[repl_hash]);
}

void ip_conntrack_hash_insert(struct ip_conntrack *ct)
{
	unsigned int hash, repl_hash;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	write_lock_bh(&ip_conntrack_lock);
	__ip_conntrack_hash_insert(ct, hash, repl_hash);
	write_unlock_bh(&ip_conntrack_lock);
}

/* Confirm a connection given skb; places it in hash table */
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
	unsigned int hash, repl_hash;
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	ct = ip_conntrack_get(*pskb, &ctinfo);

	/* ipt_REJECT uses ip_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	IP_NF_ASSERT(!is_confirmed(ct));
	DEBUGP("Confirming conntrack %p\n", ct);

	write_lock_bh(&ip_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	list_for_each_entry(h, &ip_conntrack_hash[hash], list)
		if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      &h->tuple))
			goto out;
	list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
		if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				      &h->tuple))
			goto out;

	/* Remove from unconfirmed list */
	list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

	__ip_conntrack_hash_insert(ct, hash, repl_hash);
	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout.expires += jiffies;
	add_timer(&ct->timeout);
	atomic_inc(&ct->ct_general.use);
	set_bit(IPS_CONFIRMED_BIT, &ct->status);
	CONNTRACK_STAT_INC(insert);
	write_unlock_bh(&ip_conntrack_lock);
	if (ct->helper)
		ip_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_IP_NF_NAT_NEEDED
	if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
	    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
		ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
	ip_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, *pskb);

	return NF_ACCEPT;

out:
	CONNTRACK_STAT_INC(insert_failed);
	write_unlock_bh(&ip_conntrack_lock);
	return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
			 const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	read_lock_bh(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	read_unlock_bh(&ip_conntrack_lock);

	return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static int early_drop(struct list_head *chain)
{
	/* Traverse backwards: gives us oldest, which is roughly LRU */
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct = NULL, *tmp;
	int dropped = 0;

	read_lock_bh(&ip_conntrack_lock);
	list_for_each_entry_reverse(h, chain, list) {
		tmp = tuplehash_to_ctrack(h);
		if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
			ct = tmp;
			atomic_inc(&ct->ct_general.use);
			break;
		}
	}
	read_unlock_bh(&ip_conntrack_lock);

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		CONNTRACK_STAT_INC(early_drop);
	}
	ip_conntrack_put(ct);
	return dropped;
}

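/* Look up a registered helper whose tuple/mask matches the given tuple.
 * Caller must hold ip_conntrack_lock. */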
static struct ip_conntrack_helper *
__ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_helper *h;

	list_for_each_entry(h, &helpers, list) {
		if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
			return h;
	}
	return NULL;
}

struct ip_conntrack_helper *
ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_helper *helper;

	/* need ip_conntrack_lock to assure that helper exists until
	 * try_module_get() is called */
	read_lock_bh(&ip_conntrack_lock);

	helper = __ip_conntrack_helper_find(tuple);
	if (helper) {
		/* need to increase module usage count to assure helper will
		 * not go away while the caller is e.g. busy putting a
		 * conntrack in the hash that uses the helper */
		if (!try_module_get(helper->me))
			helper = NULL;
	}

	read_unlock_bh(&ip_conntrack_lock);

	return helper;
}

void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
{
	module_put(helper->me);
}

struct ip_conntrack_protocol *
__ip_conntrack_proto_find(u_int8_t protocol)
{
	return ip_ct_protos[protocol];
}

/* this is guaranteed to always return a valid protocol helper, since
 * it falls back to generic_protocol */
struct ip_conntrack_protocol *
ip_conntrack_proto_find_get(u_int8_t protocol)
{
	struct ip_conntrack_protocol *p;

	preempt_disable();
	p = __ip_conntrack_proto_find(protocol);
	if (p) {
		if (!try_module_get(p->me))
			p = &ip_conntrack_generic_protocol;
	}
	preempt_enable();

	return p;
}

void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
{
	module_put(p->me);
}

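/* Allocate a zeroed conntrack from the slab cache.  Enforces ip_conntrack_max
 * by attempting early_drop() on the target hash chain when the table is full;
 * the timeout timer is initialized but not started until confirmation. */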
struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
					struct ip_conntrack_tuple *repl)
{
	struct ip_conntrack *conntrack;

	if (!ip_conntrack_hash_rnd_initted) {
		get_random_bytes(&ip_conntrack_hash_rnd, 4);
		ip_conntrack_hash_rnd_initted = 1;
	}

	/* We don't want any race condition at early drop stage */
	atomic_inc(&ip_conntrack_count);

	if (ip_conntrack_max
	    && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
		unsigned int hash = hash_conntrack(orig);
		/* Try dropping from this hash chain. */
		if (!early_drop(&ip_conntrack_hash[hash])) {
			atomic_dec(&ip_conntrack_count);
			if (net_ratelimit())
				printk(KERN_WARNING
				       "ip_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC);
	if (!conntrack) {
		DEBUGP("Can't allocate conntrack.\n");
		atomic_dec(&ip_conntrack_count);
		return ERR_PTR(-ENOMEM);
	}

	atomic_set(&conntrack->ct_general.use, 1);
	conntrack->ct_general.destroy = destroy_conntrack;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* Don't set timer yet: wait for confirmation */
	init_timer(&conntrack->timeout);
	conntrack->timeout.data = (unsigned long)conntrack;
	conntrack->timeout.function = death_by_timeout;

	return conntrack;
}

void
ip_conntrack_free(struct ip_conntrack *conntrack)
{
	atomic_dec(&ip_conntrack_count);
	kmem_cache_free(ip_conntrack_cachep, conntrack);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
 * failed due to stress.  Otherwise it really is unclassifiable */
static struct ip_conntrack_tuple_hash *
init_conntrack(struct ip_conntrack_tuple *tuple,
	       struct ip_conntrack_protocol *protocol,
	       struct sk_buff *skb)
{
	struct ip_conntrack *conntrack;
	struct ip_conntrack_tuple repl_tuple;
	struct ip_conntrack_expect *exp;

	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
		DEBUGP("Can't invert tuple.\n");
		return NULL;
	}

	conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
	if (conntrack == NULL || IS_ERR(conntrack))
		return (struct ip_conntrack_tuple_hash *)conntrack;

	if (!protocol->new(conntrack, skb)) {
		ip_conntrack_free(conntrack);
		return NULL;
	}

	write_lock_bh(&ip_conntrack_lock);
	exp = find_expectation(tuple);

	if (exp) {
		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
			conntrack, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
		conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
		conntrack->mark = exp->master->mark;
#endif
#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
    defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
		/* this is ugly, but there is no other place where to put it */
		conntrack->nat.masq_index = exp->master->nat.masq_index;
#endif
#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
		conntrack->secmark = exp->master->secmark;
#endif
		nf_conntrack_get(&conntrack->master->ct_general);
		CONNTRACK_STAT_INC(expect_new);
	} else {
		conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);

		CONNTRACK_STAT_INC(new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

	write_unlock_bh(&ip_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(conntrack, exp);
		ip_conntrack_expect_put(exp);
	}

	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
		  struct ip_conntrack_protocol *proto,
		  int *set_reply,
		  unsigned int hooknum,
		  enum ip_conntrack_info *ctinfo)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct;

	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
			     &tuple,proto))
		return NULL;

	/* look for tuple match */
	h = ip_conntrack_find_get(&tuple, NULL);
	if (!h) {
		h = init_conntrack(&tuple, proto, skb);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			DEBUGP("ip_conntrack_in: normal packet for %p\n",
			       ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			DEBUGP("ip_conntrack_in: related packet for %p\n",
			       ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			DEBUGP("ip_conntrack_in: new packet for %p\n",
			       ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

/* Netfilter hook itself. */
unsigned int ip_conntrack_in(unsigned int hooknum,
			     struct sk_buff **pskb,
			     const struct net_device *in,
			     const struct net_device *out,
			     int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	struct ip_conntrack_protocol *proto;
	int set_reply = 0;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if ((*pskb)->nfct) {
		CONNTRACK_STAT_INC(ignore);
		return NF_ACCEPT;
	}

	/* Never happen */
	if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
		if (net_ratelimit()) {
			printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
			       (*pskb)->nh.iph->protocol, hooknum);
		}
		return NF_DROP;
	}

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
	/* Ignore broadcast: no `connection'. */
	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
		printk("Broadcast packet!\n");
		return NF_ACCEPT;
	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
		   == htonl(0x000000FF)) {
		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
		       NIPQUAD((*pskb)->nh.iph->saddr),
		       NIPQUAD((*pskb)->nh.iph->daddr),
		       (*pskb)->sk, (*pskb)->pkt_type);
	}
#endif

	proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (proto->error != NULL
	    && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
		CONNTRACK_STAT_INC(error);
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
		/* Not valid part of a connection */
		CONNTRACK_STAT_INC(invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		CONNTRACK_STAT_INC(drop);
		return NF_DROP;
	}

	IP_NF_ASSERT((*pskb)->nfct);

	ret = proto->packet(ct, *pskb, ctinfo);
	if (ret < 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		nf_conntrack_put((*pskb)->nfct);
		(*pskb)->nfct = NULL;
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		ip_conntrack_event_cache(IPCT_STATUS, *pskb);

	return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig)
{
	return ip_ct_invert_tuple(inverse, orig,
				  __ip_conntrack_proto_find(orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct ip_conntrack_expect *a,
			       const struct ip_conntrack_expect *b)
{
	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
	struct ip_conntrack_tuple intersect_mask
		= { { a->mask.src.ip & b->mask.src.ip,
		      { a->mask.src.u.all & b->mask.src.u.all } },
		    { a->mask.dst.ip & b->mask.dst.ip,
		      { a->mask.dst.u.all & b->mask.dst.u.all },
		      a->mask.dst.protonum & b->mask.dst.protonum } };

	return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct ip_conntrack_expect *a,
				 const struct ip_conntrack_expect *b)
{
	return a->master == b->master
		&& ip_ct_tuple_equal(&a->tuple, &b->tuple)
		&& ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
	struct ip_conntrack_expect *i;

	write_lock_bh(&ip_conntrack_lock);
	/* choose the oldest expectation to evict */
	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
			ip_ct_unlink_expect(i);
			write_unlock_bh(&ip_conntrack_lock);
			ip_conntrack_expect_put(i);
			return;
		}
	}
	write_unlock_bh(&ip_conntrack_lock);
}

/* We don't increase the master conntrack refcount for non-fulfilled
 * conntracks. During the conntrack destruction, the expectations are
 * always killed before the conntrack itself */
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
	struct ip_conntrack_expect *new;

	new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
	if (!new) {
		DEBUGP("expect_related: OOM allocating expect\n");
		return NULL;
	}
	new->master = me;
	atomic_set(&new->use, 1);
	return new;
}

void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
	if (atomic_dec_and_test(&exp->use))
		kmem_cache_free(ip_conntrack_expect_cachep, exp);
}

static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
	atomic_inc(&exp->use);
	exp->master->expecting++;
	list_add(&exp->list, &ip_conntrack_expect_list);

	init_timer(&exp->timeout);
	exp->timeout.data = (unsigned long)exp;
	exp->timeout.function = expectation_timed_out;
	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
	add_timer(&exp->timeout);

	exp->id = ++ip_conntrack_expect_next_id;
	atomic_inc(&exp->use);
	CONNTRACK_STAT_INC(expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
		if (i->master == master) {
			if (del_timer(&i->timeout)) {
				ip_ct_unlink_expect(i);
				ip_conntrack_expect_put(i);
			}
			break;
		}
	}
}

static inline int refresh_timer(struct ip_conntrack_expect *i)
{
	if (!del_timer(&i->timeout))
		return 0;

	i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
	add_timer(&i->timeout);
	return 1;
}

int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
	struct ip_conntrack_expect *i;
	int ret;

	DEBUGP("ip_conntrack_expect_related %p\n", expect);
	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
	DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

	write_lock_bh(&ip_conntrack_lock);
	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		if (expect_matches(i, expect)) {
			/* Refresh timer: if it's dying, ignore.. */
			if (refresh_timer(i)) {
				ret = 0;
				goto out;
			}
		} else if (expect_clash(i, expect)) {
			ret = -EBUSY;
			goto out;
		}
	}

	/* Will be over limit? */
	if (expect->master->helper->max_expected &&
	    expect->master->expecting >= expect->master->helper->max_expected)
		evict_oldest_expect(expect->master);

	ip_conntrack_expect_insert(expect);
	ip_conntrack_expect_event(IPEXP_NEW, expect);
	ret = 0;
out:
	write_unlock_bh(&ip_conntrack_lock);
	return ret;
}

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
			      const struct ip_conntrack_tuple *newreply)
{
	write_lock_bh(&ip_conntrack_lock);
	/* Should be unconfirmed, so not in hash table yet */
	IP_NF_ASSERT(!is_confirmed(conntrack));

	DEBUGP("Altering reply tuple of %p to ", conntrack);
	DUMP_TUPLE(newreply);

	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (!conntrack->master && conntrack->expecting == 0)
		conntrack->helper = __ip_conntrack_helper_find(newreply);
	write_unlock_bh(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
	BUG_ON(me->timeout == 0);
	write_lock_bh(&ip_conntrack_lock);
	list_add(&me->list, &helpers);
	write_unlock_bh(&ip_conntrack_lock);

	return 0;
}

struct ip_conntrack_helper *
__ip_conntrack_helper_find_byname(const char *name)
{
	struct ip_conntrack_helper *h;

	list_for_each_entry(h, &helpers, list) {
		if (!strcmp(h->name, name))
			return h;
	}

	return NULL;
}

static inline void unhelp(struct ip_conntrack_tuple_hash *i,
			  const struct ip_conntrack_helper *me)
{
	if (tuplehash_to_ctrack(i)->helper == me) {
		ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
		tuplehash_to_ctrack(i)->helper = NULL;
	}
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
	unsigned int i;
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_expect *exp, *tmp;

	/* Need write lock here, to delete helper. */
	write_lock_bh(&ip_conntrack_lock);
	list_del(&me->list);

	/* Get rid of expectations */
	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
		if (exp->master->helper == me && del_timer(&exp->timeout)) {
			ip_ct_unlink_expect(exp);
			ip_conntrack_expect_put(exp);
		}
	}
	/* Get rid of expecteds, set helpers to NULL. */
	list_for_each_entry(h, &unconfirmed, list)
		unhelp(h, me);
	for (i = 0; i < ip_conntrack_htable_size; i++) {
		list_for_each_entry(h, &ip_conntrack_hash[i], list)
			unhelp(h, me);
	}
	write_unlock_bh(&ip_conntrack_lock);

	/* Someone could be still looking at the helper in a bh. */
	synchronize_net();
}

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __ip_ct_refresh_acct(struct ip_conntrack *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	int event = 0;

	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
	IP_NF_ASSERT(skb);

	write_lock_bh(&ip_conntrack_lock);

	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
		write_unlock_bh(&ip_conntrack_lock);
		return;
	}

	/* If not in hash table, timer will not be active yet */
	if (!is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
		event = IPCT_REFRESH;
	} else {
		/* Need del_timer for race avoidance (may already be dying). */
		if (del_timer(&ct->timeout)) {
			ct->timeout.expires = jiffies + extra_jiffies;
			add_timer(&ct->timeout);
			event = IPCT_REFRESH;
		}
	}

#ifdef CONFIG_IP_NF_CT_ACCT
	if (do_acct) {
		ct->counters[CTINFO2DIR(ctinfo)].packets++;
		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
						ntohs(skb->nh.iph->tot_len);
		if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
		    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
			event |= IPCT_COUNTER_FILLING;
	}
#endif

	write_unlock_bh(&ip_conntrack_lock);

	/* must be unlocked when calling event cache */
	if (event)
		ip_conntrack_event_cache(event, skb);
}

#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
    defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
			       const struct ip_conntrack_tuple *tuple)
{
	NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
		&tuple->src.u.tcp.port);
	NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
		&tuple->dst.u.tcp.port);
	return 0;

nfattr_failure:
	return -1;
}

int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
			       struct ip_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
		return -EINVAL;

	t->src.u.tcp.port =
		*(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
	t->dst.u.tcp.port =
		*(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);

	return 0;
}
#endif

/* Returns new sk_buff, or NULL */
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
	skb_orphan(skb);

	local_bh_disable();
	skb = ip_defrag(skb, user);
	local_bh_enable();

	if (skb)
		ip_send_check(skb->nh.iph);
	return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = ip_conntrack_get(skb, &ctinfo);

	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

/* Bring out ya dead! */
static struct ip_conntrack *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
		void *data, unsigned int *bucket)
{
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct;

	write_lock_bh(&ip_conntrack_lock);
	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
		list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
			ct = tuplehash_to_ctrack(h);
			if (iter(ct, data))
				goto found;
		}
	}
	list_for_each_entry(h, &unconfirmed, list) {
		ct = tuplehash_to_ctrack(h);
		if (iter(ct, data))
			goto found;
	}
	write_unlock_bh(&ip_conntrack_lock);
	return NULL;

found:
	atomic_inc(&ct->ct_general.use);
	write_unlock_bh(&ip_conntrack_lock);
	return ct;
}

void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
	struct ip_conntrack *ct;
	unsigned int bucket = 0;

	while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		ip_conntrack_put(ct);
	}
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_tuple tuple;

	IP_CT_TUPLE_U_BLANK(&tuple);
	tuple.src.ip = inet->rcv_saddr;
	tuple.src.u.tcp.port = inet->sport;
	tuple.dst.ip = inet->daddr;
	tuple.dst.u.tcp.port = inet->dport;
	tuple.dst.protonum = IPPROTO_TCP;

	/* We only do TCP at the moment: is there a better way? */
	if (strcmp(sk->sk_prot->name, "TCP")) {
		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
		return -ENOPROTOOPT;
	}

	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
		       *len, sizeof(struct sockaddr_in));
		return -EINVAL;
	}

	h = ip_conntrack_find_get(&tuple, NULL);
	if (h) {
		struct sockaddr_in sin;
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);

		sin.sin_family = AF_INET;
		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.u.tcp.port;
		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.ip;
		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
		ip_conntrack_put(ct);
		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
			return -EFAULT;
		else
			return 0;
	}
	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
	return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
	.pf		= PF_INET,
	.get_optmin	= SO_ORIGINAL_DST,
	.get_optmax	= SO_ORIGINAL_DST+1,
	.get		= &getorigdst,
};

static int kill_all(struct ip_conntrack *i, void *data)
{
	return 1;
}

void ip_conntrack_flush(void)
{
	ip_ct_iterate_cleanup(kill_all, NULL);
}

static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
{
	if (vmalloced)
		vfree(hash);
	else
		free_pages((unsigned long)hash,
			   get_order(sizeof(struct list_head) * size));
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
	ip_ct_attach = NULL;

	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	ip_ct_event_cache_flush();
 i_see_dead_people:
	ip_conntrack_flush();
	if (atomic_read(&ip_conntrack_count) != 0) {
		schedule();
		goto i_see_dead_people;
	}
	/* wait until all references to ip_conntrack_untracked are dropped */
	while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
		schedule();

	kmem_cache_destroy(ip_conntrack_cachep);
	kmem_cache_destroy(ip_conntrack_expect_cachep);
	free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
			    ip_conntrack_htable_size);
	nf_unregister_sockopt(&so_getorigdst);
}

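/* Allocate the hash table: try physically contiguous pages first and fall
 * back to vmalloc(), reporting which method was used via *vmalloced. */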
static struct list_head *alloc_hashtable(int size, int *vmalloced)
{
	struct list_head *hash;
	unsigned int i;

	*vmalloced = 0;
	hash = (void*)__get_free_pages(GFP_KERNEL,
				       get_order(sizeof(struct list_head)
						 * size));
	if (!hash) {
		*vmalloced = 1;
		printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
		hash = vmalloc(sizeof(struct list_head) * size);
	}

	if (hash)
		for (i = 0; i < size; i++)
			INIT_LIST_HEAD(&hash[i]);

	return hash;
}

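/* Handler for the "hashsize" module parameter: resize the hash table at
 * runtime by allocating a new table, rehashing every entry with a fresh
 * random seed and then freeing the old table. */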
static int set_hashsize(const char *val, struct kernel_param *kp)
{
	int i, bucket, hashsize, vmalloced;
	int old_vmalloced, old_size;
	int rnd;
	struct list_head *hash, *old_hash;
	struct ip_conntrack_tuple_hash *h;

	/* On boot, we can set this without any fancy locking. */
	if (!ip_conntrack_htable_size)
		return param_set_int(val, kp);

	hashsize = simple_strtol(val, NULL, 0);
	if (!hashsize)
		return -EINVAL;

	hash = alloc_hashtable(hashsize, &vmalloced);
	if (!hash)
		return -ENOMEM;

	/* We have to rehash for the new table anyway, so we also can
	 * use a new random seed */
	get_random_bytes(&rnd, 4);

	write_lock_bh(&ip_conntrack_lock);
	for (i = 0; i < ip_conntrack_htable_size; i++) {
		while (!list_empty(&ip_conntrack_hash[i])) {
			h = list_entry(ip_conntrack_hash[i].next,
				       struct ip_conntrack_tuple_hash, list);
			list_del(&h->list);
			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
			list_add_tail(&h->list, &hash[bucket]);
		}
	}
	old_size = ip_conntrack_htable_size;
	old_vmalloced = ip_conntrack_vmalloc;
	old_hash = ip_conntrack_hash;

	ip_conntrack_htable_size = hashsize;
	ip_conntrack_vmalloc = vmalloced;
	ip_conntrack_hash = hash;
	ip_conntrack_hash_rnd = rnd;
	write_unlock_bh(&ip_conntrack_lock);

	free_conntrack_hash(old_hash, old_vmalloced, old_size);
	return 0;
}

module_param_call(hashsize, set_hashsize, param_get_uint,
		  &ip_conntrack_htable_size, 0600);

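/* Module init: size the hash table from available memory (unless overridden
 * by the hashsize parameter), register the SO_ORIGINAL_DST sockopt, create
 * the conntrack and expectation slab caches, wire in the built-in protocol
 * trackers and set up the untracked conntrack. */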
int __init ip_conntrack_init(void)
{
	unsigned int i;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
	if (!ip_conntrack_htable_size) {
		ip_conntrack_htable_size
			= (((num_physpages << PAGE_SHIFT) / 16384)
			   / sizeof(struct list_head));
		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
			ip_conntrack_htable_size = 8192;
		if (ip_conntrack_htable_size < 16)
			ip_conntrack_htable_size = 16;
	}
	ip_conntrack_max = 8 * ip_conntrack_htable_size;

	printk("ip_conntrack version %s (%u buckets, %d max)"
	       " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
	       ip_conntrack_htable_size, ip_conntrack_max,
	       sizeof(struct ip_conntrack));

	ret = nf_register_sockopt(&so_getorigdst);
	if (ret != 0) {
		printk(KERN_ERR "Unable to register netfilter socket option\n");
		return ret;
	}

	ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
					    &ip_conntrack_vmalloc);
	if (!ip_conntrack_hash) {
		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
		goto err_unreg_sockopt;
	}

	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
						sizeof(struct ip_conntrack), 0,
						0, NULL, NULL);
	if (!ip_conntrack_cachep) {
		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
		goto err_free_hash;
	}

	ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
					sizeof(struct ip_conntrack_expect),
					0, 0, NULL, NULL);
	if (!ip_conntrack_expect_cachep) {
		printk(KERN_ERR "Unable to create ip_expect slab cache\n");
		goto err_free_conntrack_slab;
	}

	/* Don't NEED lock here, but good form anyway. */
	write_lock_bh(&ip_conntrack_lock);
	for (i = 0; i < MAX_IP_CT_PROTO; i++)
		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
	/* Sew in builtin protocols. */
	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
	write_unlock_bh(&ip_conntrack_lock);

	/* For use by ipt_REJECT */
	ip_ct_attach = ip_conntrack_attach;

	/* Set up fake conntrack:
	    - to never be deleted, not in any hashes */
	atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
	/*  - and to look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

	return ret;

err_free_conntrack_slab:
	kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
	free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
			    ip_conntrack_htable_size);
err_unreg_sockopt:
	nf_unregister_sockopt(&so_getorigdst);

	return -ENOMEM;
}