/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/icmp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>

/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <linux/netfilter_ipv4/ip_conntrack.h>
#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define IP_CONNTRACK_VERSION	"2.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(ip_conntrack_lock);

/* ip_conntrack_standalone needs this */
atomic_t ip_conntrack_count = ATOMIC_INIT(0);

void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
LIST_HEAD(ip_conntrack_expect_list);
struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
static LIST_HEAD(helpers);
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
static kmem_cache_t *ip_conntrack_cachep;
static kmem_cache_t *ip_conntrack_expect_cachep;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;

DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);

void
ip_conntrack_put(struct ip_conntrack *ct)
{
	IP_NF_ASSERT(ct);
	nf_conntrack_put(&ct->ct_general);
}

static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;

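/* Hash a tuple into one of ip_conntrack_htable_size buckets.  The three
 * 32-bit inputs to jhash_3words() fold in the source address, the
 * destination address mixed with the protocol number, and both ports
 * (source in the low 16 bits, destination in the high 16); the per-boot
 * random seed ip_conntrack_hash_rnd makes the bucket layout hard for a
 * remote attacker to predict. */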
static u_int32_t
hash_conntrack(const struct ip_conntrack_tuple *tuple)
{
#if 0
	dump_tuple(tuple);
#endif
	return (jhash_3words(tuple->src.ip,
			     (tuple->dst.ip ^ tuple->dst.protonum),
			     (tuple->src.u.all | (tuple->dst.u.all << 16)),
			     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
}

int
ip_ct_get_tuple(const struct iphdr *iph,
		const struct sk_buff *skb,
		unsigned int dataoff,
		struct ip_conntrack_tuple *tuple,
		const struct ip_conntrack_protocol *protocol)
{
	/* Never happens: fragments are reassembled before conntrack runs. */
	if (iph->frag_off & htons(IP_OFFSET)) {
		printk("ip_conntrack_core: Frag of proto %u.\n",
		       iph->protocol);
		return 0;
	}

	tuple->src.ip = iph->saddr;
	tuple->dst.ip = iph->daddr;
	tuple->dst.protonum = iph->protocol;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig,
		   const struct ip_conntrack_protocol *protocol)
{
	inverse->src.ip = orig->dst.ip;
	inverse->dst.ip = orig->src.ip;
	inverse->dst.protonum = orig->dst.protonum;
	inverse->dst.dir = !orig->dst.dir;

	return protocol->invert_tuple(inverse, orig);
}

/* ip_conntrack_expect helper functions */
static void destroy_expect(struct ip_conntrack_expect *exp)
{
	ip_conntrack_put(exp->master);
	IP_NF_ASSERT(!timer_pending(&exp->timeout));
	kmem_cache_free(ip_conntrack_expect_cachep, exp);
	CONNTRACK_STAT_INC(expect_delete);
}

static void unlink_expect(struct ip_conntrack_expect *exp)
{
	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
	list_del(&exp->list);
	/* Logically in destroy_expect, but we hold the lock here. */
	exp->master->expecting--;
}

static void expectation_timed_out(unsigned long ul_expect)
{
	struct ip_conntrack_expect *exp = (void *)ul_expect;

	write_lock_bh(&ip_conntrack_lock);
	unlink_expect(exp);
	write_unlock_bh(&ip_conntrack_lock);
	destroy_expect(exp);
}

/* If an expectation for this connection is found, it gets deleted from
 * the global list and returned. */
static struct ip_conntrack_expect *
find_expectation(const struct ip_conntrack_tuple *tuple)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
		    && is_confirmed(i->master)
		    && del_timer(&i->timeout)) {
			unlink_expect(i);
			return i;
		}
	}
	return NULL;
}

/* delete all expectations for this conntrack */
static void remove_expectations(struct ip_conntrack *ct)
{
	struct ip_conntrack_expect *i, *tmp;

	/* Optimization: most connections never expect any others. */
	if (ct->expecting == 0)
		return;

	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
		if (i->master == ct && del_timer(&i->timeout)) {
			unlink_expect(i);
			destroy_expect(i);
		}
	}
}

static void
clean_from_lists(struct ip_conntrack *ct)
{
	unsigned int ho, hr;

	DEBUGP("clean_from_lists(%p)\n", ct);
	ASSERT_WRITE_LOCK(&ip_conntrack_lock);

	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

	/* Destroy all pending expectations */
	remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
	struct ip_conntrack_protocol *proto;

	DEBUGP("destroy_conntrack(%p)\n", ct);
	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
	IP_NF_ASSERT(!timer_pending(&ct->timeout));

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to ip_conntrack_lock!!! -HW */
	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
	if (proto && proto->destroy)
		proto->destroy(ct);

	if (ip_conntrack_destroyed)
		ip_conntrack_destroyed(ct);

	write_lock_bh(&ip_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!is_confirmed(ct)) {
		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	}

	CONNTRACK_STAT_INC(delete);
	write_unlock_bh(&ip_conntrack_lock);

	if (ct->master)
		ip_conntrack_put(ct->master);

	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
	kmem_cache_free(ip_conntrack_cachep, ct);
	atomic_dec(&ip_conntrack_count);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct ip_conntrack *ct = (void *)ul_conntrack;

	write_lock_bh(&ip_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	CONNTRACK_STAT_INC(delete_list);
	clean_from_lists(ct);
	write_unlock_bh(&ip_conntrack_lock);
	ip_conntrack_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
		    const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	ASSERT_READ_LOCK(&ip_conntrack_lock);
	return tuplehash_to_ctrack(i) != ignored_conntrack
		&& ip_ct_tuple_equal(tuple, &i->tuple);
}

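/* Core lookup: walks a single hash chain.  Callers must hold
 * ip_conntrack_lock at least for reading.  The ignored_conntrack
 * argument lets NAT ask whether a tuple is taken by anyone *else*
 * (see ip_conntrack_tuple_taken() below). */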
static struct ip_conntrack_tuple_hash *
__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
		    const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int hash = hash_conntrack(tuple);

	ASSERT_READ_LOCK(&ip_conntrack_lock);
	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
			CONNTRACK_STAT_INC(found);
			return h;
		}
		CONNTRACK_STAT_INC(searched);
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
struct ip_conntrack_tuple_hash *
ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
		      const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	read_lock_bh(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	if (h)
		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
	read_unlock_bh(&ip_conntrack_lock);

	return h;
}

/* Confirm a connection given skb; places it in hash table */
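/* Confirmation happens on the first original-direction packet, once it
 * has passed all hooks: only then do we move the conntrack from the
 * unconfirmed list into the real hash table and arm its timer.  Until
 * that point the entry is visible only to the packet that created it,
 * which is what makes the lock-free setup in init_conntrack() safe. */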
int
__ip_conntrack_confirm(struct sk_buff **pskb)
{
	unsigned int hash, repl_hash;
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	ct = ip_conntrack_get(*pskb, &ctinfo);

	/* ipt_REJECT uses ip_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	IP_NF_ASSERT(!is_confirmed(ct));
	DEBUGP("Confirming conntrack %p\n", ct);

	write_lock_bh(&ip_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost the race. */
	if (!LIST_FIND(&ip_conntrack_hash[hash],
		       conntrack_tuple_cmp,
		       struct ip_conntrack_tuple_hash *,
		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
			  conntrack_tuple_cmp,
			  struct ip_conntrack_tuple_hash *,
			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
		/* Remove from unconfirmed list */
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

		list_prepend(&ip_conntrack_hash[hash],
			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
		list_prepend(&ip_conntrack_hash[repl_hash],
			     &ct->tuplehash[IP_CT_DIR_REPLY]);
		/* Timer relative to confirmation time, not original
		   setting time, otherwise we'd get timer wrap in
		   weird delay cases. */
		ct->timeout.expires += jiffies;
		add_timer(&ct->timeout);
		atomic_inc(&ct->ct_general.use);
		set_bit(IPS_CONFIRMED_BIT, &ct->status);
		CONNTRACK_STAT_INC(insert);
		write_unlock_bh(&ip_conntrack_lock);
		return NF_ACCEPT;
	}

	CONNTRACK_STAT_INC(insert_failed);
	write_unlock_bh(&ip_conntrack_lock);

	return NF_DROP;
}

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
			 const struct ip_conntrack *ignored_conntrack)
{
	struct ip_conntrack_tuple_hash *h;

	read_lock_bh(&ip_conntrack_lock);
	h = __ip_conntrack_find(tuple, ignored_conntrack);
	read_unlock_bh(&ip_conntrack_lock);

	return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
{
	return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
}

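/* Table-full policy: rather than refusing all new connections, evict
 * the oldest entry in this hash chain that has not been marked ASSURED
 * by its protocol tracker.  Assured connections are never dropped this
 * way, so established traffic survives a flood of new flows. */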
static int early_drop(struct list_head *chain)
{
	/* Traverse backwards: gives us oldest, which is roughly LRU */
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct = NULL;
	int dropped = 0;

	read_lock_bh(&ip_conntrack_lock);
	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
	if (h) {
		ct = tuplehash_to_ctrack(h);
		atomic_inc(&ct->ct_general.use);
	}
	read_unlock_bh(&ip_conntrack_lock);

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		CONNTRACK_STAT_INC(early_drop);
	}
	ip_conntrack_put(ct);
	return dropped;
}

static inline int helper_cmp(const struct ip_conntrack_helper *i,
			     const struct ip_conntrack_tuple *rtuple)
{
	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
{
	return LIST_FIND(&helpers, helper_cmp,
			 struct ip_conntrack_helper *,
			 tuple);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
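/* Runs in packet context, so the allocation must be GFP_ATOMIC, and
 * the ip_conntrack_max check only attempts early_drop() on the chain
 * the new tuple hashes to -- a cheap approximation of global LRU
 * eviction. */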
static struct ip_conntrack_tuple_hash *
init_conntrack(const struct ip_conntrack_tuple *tuple,
	       struct ip_conntrack_protocol *protocol,
	       struct sk_buff *skb)
{
	struct ip_conntrack *conntrack;
	struct ip_conntrack_tuple repl_tuple;
	size_t hash;
	struct ip_conntrack_expect *exp;

	if (!ip_conntrack_hash_rnd_initted) {
		get_random_bytes(&ip_conntrack_hash_rnd, 4);
		ip_conntrack_hash_rnd_initted = 1;
	}

	hash = hash_conntrack(tuple);

	if (ip_conntrack_max
	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
		/* Try dropping from this hash chain. */
		if (!early_drop(&ip_conntrack_hash[hash])) {
			if (net_ratelimit())
				printk(KERN_WARNING
				       "ip_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
		DEBUGP("Can't invert tuple.\n");
		return NULL;
	}

	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
	if (!conntrack) {
		DEBUGP("Can't allocate conntrack.\n");
		return ERR_PTR(-ENOMEM);
	}

	memset(conntrack, 0, sizeof(*conntrack));
	atomic_set(&conntrack->ct_general.use, 1);
	conntrack->ct_general.destroy = destroy_conntrack;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
	if (!protocol->new(conntrack, skb)) {
		kmem_cache_free(ip_conntrack_cachep, conntrack);
		return NULL;
	}
	/* Don't set timer yet: wait for confirmation */
	init_timer(&conntrack->timeout);
	conntrack->timeout.data = (unsigned long)conntrack;
	conntrack->timeout.function = death_by_timeout;

	write_lock_bh(&ip_conntrack_lock);
	exp = find_expectation(tuple);

	if (exp) {
		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
			conntrack, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
		conntrack->master = exp->master;
#ifdef CONFIG_IP_NF_CONNTRACK_MARK
		conntrack->mark = exp->master->mark;
#endif
		nf_conntrack_get(&conntrack->master->ct_general);
		CONNTRACK_STAT_INC(expect_new);
	} else {
		conntrack->helper = ip_ct_find_helper(&repl_tuple);

		CONNTRACK_STAT_INC(new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

	atomic_inc(&ip_conntrack_count);
	write_unlock_bh(&ip_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(conntrack, exp);
		destroy_expect(exp);
	}

	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
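/* The ctinfo value encodes both state and direction:
 *   IP_CT_NEW          first packet of an unconfirmed connection
 *   IP_CT_RELATED      packet matched an expectation (e.g. FTP data)
 *   IP_CT_ESTABLISHED  packets after a reply has been seen
 * Reply-direction packets get IP_CT_IS_REPLY added on top. */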
static inline struct ip_conntrack *
resolve_normal_ct(struct sk_buff *skb,
		  struct ip_conntrack_protocol *proto,
		  int *set_reply,
		  unsigned int hooknum,
		  enum ip_conntrack_info *ctinfo)
{
	struct ip_conntrack_tuple tuple;
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack *ct;

	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);

	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
			     &tuple, proto))
		return NULL;

	/* look for tuple match */
	h = ip_conntrack_find_get(&tuple, NULL);
	if (!h) {
		h = init_conntrack(&tuple, proto, skb);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			DEBUGP("ip_conntrack_in: normal packet for %p\n",
			       ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			DEBUGP("ip_conntrack_in: related packet for %p\n",
			       ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			DEBUGP("ip_conntrack_in: new packet for %p\n",
			       ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

/* Netfilter hook itself. */
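/* Order of work here: skip packets that already carry conntrack state,
 * reject fragments (defragmentation happens before this hook), let the
 * L4 protocol vet the header via ->error(), look up or create the
 * conntrack with resolve_normal_ct(), then run the protocol state
 * machine via ->packet(). */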
unsigned int ip_conntrack_in(unsigned int hooknum,
			     struct sk_buff **pskb,
			     const struct net_device *in,
			     const struct net_device *out,
			     int (*okfn)(struct sk_buff *))
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;
	struct ip_conntrack_protocol *proto;
	int set_reply;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if ((*pskb)->nfct) {
		CONNTRACK_STAT_INC(ignore);
		return NF_ACCEPT;
	}

	/* Never happens: fragments are reassembled before this hook. */
	if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
		if (net_ratelimit()) {
			printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
			       (*pskb)->nh.iph->protocol, hooknum);
		}
		return NF_DROP;
	}

	/* FIXME: Do this right please. --RR */
	(*pskb)->nfcache |= NFC_UNKNOWN;

/* Doesn't cover locally-generated broadcast, so not worth it. */
#if 0
	/* Ignore broadcast: no `connection'. */
	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
		printk("Broadcast packet!\n");
		return NF_ACCEPT;
	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
		   == htonl(0x000000FF)) {
		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
		       NIPQUAD((*pskb)->nh.iph->saddr),
		       NIPQUAD((*pskb)->nh.iph->daddr),
		       (*pskb)->sk, (*pskb)->pkt_type);
	}
#endif

	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (proto->error != NULL
	    && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
		CONNTRACK_STAT_INC(error);
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
		/* Not valid part of a connection */
		CONNTRACK_STAT_INC(invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		CONNTRACK_STAT_INC(drop);
		return NF_DROP;
	}

	IP_NF_ASSERT((*pskb)->nfct);

	ret = proto->packet(ct, *pskb, ctinfo);
	if (ret < 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		nf_conntrack_put((*pskb)->nfct);
		(*pskb)->nfct = NULL;
		CONNTRACK_STAT_INC(invalid);
		return -ret;
	}

	if (set_reply)
		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);

	return ret;
}

int invert_tuplepr(struct ip_conntrack_tuple *inverse,
		   const struct ip_conntrack_tuple *orig)
{
	return ip_ct_invert_tuple(inverse, orig,
				  ip_ct_find_proto(orig->dst.protonum));
}

/* Would two expected things clash? */
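/* Two expectations clash when a single packet could match both.  Only
 * the bits covered by *both* masks can tell them apart, so the tuples
 * are compared under the intersection of the masks: e.g. two
 * expectations that both wildcard the source port clash iff everything
 * else is equal. */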
static inline int expect_clash(const struct ip_conntrack_expect *a,
			       const struct ip_conntrack_expect *b)
{
	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
	struct ip_conntrack_tuple intersect_mask
		= { { a->mask.src.ip & b->mask.src.ip,
		      { a->mask.src.u.all & b->mask.src.u.all } },
		    { a->mask.dst.ip & b->mask.dst.ip,
		      { a->mask.dst.u.all & b->mask.dst.u.all },
		      a->mask.dst.protonum & b->mask.dst.protonum } };

	return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

static inline int expect_matches(const struct ip_conntrack_expect *a,
				 const struct ip_conntrack_expect *b)
{
	return a->master == b->master
		&& ip_ct_tuple_equal(&a->tuple, &b->tuple)
		&& ip_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
{
	struct ip_conntrack_expect *i;

	write_lock_bh(&ip_conntrack_lock);
	/* choose the oldest expectation to evict */
	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
			unlink_expect(i);
			write_unlock_bh(&ip_conntrack_lock);
			destroy_expect(i);
			return;
		}
	}
	write_unlock_bh(&ip_conntrack_lock);
}

struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
{
	struct ip_conntrack_expect *new;

	new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
	if (!new) {
		DEBUGP("expect_related: OOM allocating expect\n");
		return NULL;
	}
	new->master = NULL;
	return new;
}

void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
{
	kmem_cache_free(ip_conntrack_expect_cachep, expect);
}

static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
{
	atomic_inc(&exp->master->ct_general.use);
	exp->master->expecting++;
	list_add(&exp->list, &ip_conntrack_expect_list);

	init_timer(&exp->timeout);
	exp->timeout.data = (unsigned long)exp;
	exp->timeout.function = expectation_timed_out;
	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
	add_timer(&exp->timeout);

	CONNTRACK_STAT_INC(expect_create);
}

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct ip_conntrack *master)
{
	struct ip_conntrack_expect *i;

	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
		if (i->master == master) {
			if (del_timer(&i->timeout)) {
				unlink_expect(i);
				destroy_expect(i);
			}
			break;
		}
	}
}

static inline int refresh_timer(struct ip_conntrack_expect *i)
{
	if (!del_timer(&i->timeout))
		return 0;

	i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
	add_timer(&i->timeout);
	return 1;
}

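/* Register a new expectation.  A duplicate merely refreshes the
 * existing timer, a clash with someone else's expectation is refused
 * with -EBUSY, and helpers with a max_expected limit evict their own
 * oldest expectation to make room. */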
int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
{
	struct ip_conntrack_expect *i;
	int ret;

	DEBUGP("ip_conntrack_expect_related %p\n", expect);
	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
	DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);

	write_lock_bh(&ip_conntrack_lock);
	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
		if (expect_matches(i, expect)) {
			/* Refresh timer: if it's dying, ignore.. */
			if (refresh_timer(i)) {
				ret = 0;
				/* We don't need the one they've given us. */
				ip_conntrack_expect_free(expect);
				goto out;
			}
		} else if (expect_clash(i, expect)) {
			ret = -EBUSY;
			goto out;
		}
	}

	/* Will be over limit? */
	if (expect->master->helper->max_expected &&
	    expect->master->expecting >= expect->master->helper->max_expected)
		evict_oldest_expect(expect->master);

	ip_conntrack_expect_insert(expect);
	ret = 0;
out:
	write_unlock_bh(&ip_conntrack_lock);
	return ret;
}

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __ip_conntrack_confirm */
void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
			      const struct ip_conntrack_tuple *newreply)
{
	write_lock_bh(&ip_conntrack_lock);
	/* Should be unconfirmed, so not in hash table yet */
	IP_NF_ASSERT(!is_confirmed(conntrack));

	DEBUGP("Altering reply tuple of %p to ", conntrack);
	DUMP_TUPLE(newreply);

	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (!conntrack->master && conntrack->expecting == 0)
		conntrack->helper = ip_ct_find_helper(newreply);
	write_unlock_bh(&ip_conntrack_lock);
}

int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
{
	BUG_ON(me->timeout == 0);
	write_lock_bh(&ip_conntrack_lock);
	list_prepend(&helpers, me);
	write_unlock_bh(&ip_conntrack_lock);

	return 0;
}

static inline int unhelp(struct ip_conntrack_tuple_hash *i,
			 const struct ip_conntrack_helper *me)
{
	if (tuplehash_to_ctrack(i)->helper == me)
		tuplehash_to_ctrack(i)->helper = NULL;
	return 0;
}

void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
{
	unsigned int i;
	struct ip_conntrack_expect *exp, *tmp;

	/* Need write lock here, to delete helper. */
	write_lock_bh(&ip_conntrack_lock);
	LIST_DELETE(&helpers, me);

	/* Get rid of expectations */
	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
		if (exp->master->helper == me && del_timer(&exp->timeout)) {
			unlink_expect(exp);
			destroy_expect(exp);
		}
	}
	/* Get rid of expecteds, set helpers to NULL. */
	LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
	for (i = 0; i < ip_conntrack_htable_size; i++)
		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
			    struct ip_conntrack_tuple_hash *, me);
	write_unlock_bh(&ip_conntrack_lock);

	/* Someone could be still looking at the helper in a bh. */
	synchronize_net();
}

static inline void ct_add_counters(struct ip_conntrack *ct,
				   enum ip_conntrack_info ctinfo,
				   const struct sk_buff *skb)
{
#ifdef CONFIG_IP_NF_CT_ACCT
	if (skb) {
		ct->counters[CTINFO2DIR(ctinfo)].packets++;
		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
					ntohs(skb->nh.iph->tot_len);
	}
#endif
}

/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
void ip_ct_refresh_acct(struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo,
			const struct sk_buff *skb,
			unsigned long extra_jiffies)
{
	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);

	/* If not in hash table, timer will not be active yet */
	if (!is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
		ct_add_counters(ct, ctinfo, skb);
	} else {
		write_lock_bh(&ip_conntrack_lock);
		/* Need del_timer for race avoidance (may already be dying). */
		if (del_timer(&ct->timeout)) {
			ct->timeout.expires = jiffies + extra_jiffies;
			add_timer(&ct->timeout);
		}
		ct_add_counters(ct, ctinfo, skb);
		write_unlock_bh(&ip_conntrack_lock);
	}
}

/* Returns new sk_buff, or NULL */
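/* Defragment before connection tracking: conntrack only works on whole
 * packets.  BHs are disabled around ip_defrag() because the fragment
 * queues are otherwise only touched from softirq context, while this
 * can be called on the locally-generated output path; skb_orphan()
 * drops any owning-socket reference first, presumably so a queued
 * fragment cannot pin the socket. */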
struct sk_buff *
ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
{
	skb_orphan(skb);

	local_bh_disable();
	skb = ip_defrag(skb, user);
	local_bh_enable();

	if (skb) {
		ip_send_check(skb->nh.iph);
		skb->nfcache |= NFC_ALTERED;
	}
	return skb;
}

/* Used by ipt_REJECT. */
static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct ip_conntrack *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = ip_conntrack_get(skb, &ctinfo);

	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
	int (*iter)(struct ip_conntrack *i, void *data),
	void *data)
{
	return iter(tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct ip_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
		void *data, unsigned int *bucket)
{
	struct ip_conntrack_tuple_hash *h = NULL;

	write_lock_bh(&ip_conntrack_lock);
	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
				struct ip_conntrack_tuple_hash *, iter, data);
		if (h)
			break;
	}
	if (!h)
		h = LIST_FIND_W(&unconfirmed, do_iter,
				struct ip_conntrack_tuple_hash *, iter, data);
	if (h)
		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
	write_unlock_bh(&ip_conntrack_lock);

	return h;
}

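/* Walk every conntrack (hash table plus the unconfirmed list) and kill
 * each one for which iter() returns true.  get_next_corpse() resumes
 * from *bucket on each call, so victims are handled one per lock
 * acquisition instead of holding the write lock for the whole sweep. */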
void
ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
{
	struct ip_conntrack_tuple_hash *h;
	unsigned int bucket = 0;

	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		ip_conntrack_put(ct);
	}
}

/* Fast function for those who don't want to parse /proc (and I don't
   blame them). */
/* Reversing the socket's dst/src point of view gives us the reply
   mapping. */
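/* Userspace reaches this through getsockopt() on a connected TCP
 * socket, typically from a transparent proxy.  A minimal, illustrative
 * caller (connect_to_real_server() is a hypothetical helper):
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *	if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len) == 0)
 *		connect_to_real_server(&dst);
 *
 * The returned sockaddr_in holds the pre-NAT destination. */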
static int
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_conntrack_tuple_hash *h;
	struct ip_conntrack_tuple tuple;

	IP_CT_TUPLE_U_BLANK(&tuple);
	tuple.src.ip = inet->rcv_saddr;
	tuple.src.u.tcp.port = inet->sport;
	tuple.dst.ip = inet->daddr;
	tuple.dst.u.tcp.port = inet->dport;
	tuple.dst.protonum = IPPROTO_TCP;

	/* We only do TCP at the moment: is there a better way? */
	if (strcmp(sk->sk_prot->name, "TCP")) {
		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
		return -ENOPROTOOPT;
	}

	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
		DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
		       *len, sizeof(struct sockaddr_in));
		return -EINVAL;
	}

	h = ip_conntrack_find_get(&tuple, NULL);
	if (h) {
		struct sockaddr_in sin;
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);

		sin.sin_family = AF_INET;
		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.u.tcp.port;
		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
			.tuple.dst.ip;

		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
		ip_conntrack_put(ct);
		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
			return -EFAULT;
		else
			return 0;
	}
	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
	return -ENOENT;
}

static struct nf_sockopt_ops so_getorigdst = {
	.pf		= PF_INET,
	.get_optmin	= SO_ORIGINAL_DST,
	.get_optmax	= SO_ORIGINAL_DST+1,
	.get		= &getorigdst,
};

static int kill_all(struct ip_conntrack *i, void *data)
{
	return 1;
}

static void free_conntrack_hash(void)
{
	if (ip_conntrack_vmalloc)
		vfree(ip_conntrack_hash);
	else
		free_pages((unsigned long)ip_conntrack_hash,
			   get_order(sizeof(struct list_head)
				     * ip_conntrack_htable_size));
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void ip_conntrack_cleanup(void)
{
	ip_ct_attach = NULL;
	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

 i_see_dead_people:
	ip_ct_iterate_cleanup(kill_all, NULL);
	if (atomic_read(&ip_conntrack_count) != 0) {
		schedule();
		goto i_see_dead_people;
	}

	kmem_cache_destroy(ip_conntrack_cachep);
	kmem_cache_destroy(ip_conntrack_expect_cachep);
	free_conntrack_hash();
	nf_unregister_sockopt(&so_getorigdst);
}

static int hashsize;
module_param(hashsize, int, 0400);

int __init ip_conntrack_init(void)
{
	unsigned int i;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
	if (hashsize) {
		ip_conntrack_htable_size = hashsize;
	} else {
		ip_conntrack_htable_size
			= (((num_physpages << PAGE_SHIFT) / 16384)
			   / sizeof(struct list_head));
		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
			ip_conntrack_htable_size = 8192;
		if (ip_conntrack_htable_size < 16)
			ip_conntrack_htable_size = 16;
	}
	ip_conntrack_max = 8 * ip_conntrack_htable_size;

	printk("ip_conntrack version %s (%u buckets, %d max)"
	       " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
	       ip_conntrack_htable_size, ip_conntrack_max,
	       sizeof(struct ip_conntrack));

	ret = nf_register_sockopt(&so_getorigdst);
	if (ret != 0) {
		printk(KERN_ERR "Unable to register netfilter socket option\n");
		return ret;
	}

	/* AK: the hash table is twice as big as needed because it
	   uses list_head; it would be much nicer for the caches if we
	   could use a single-pointer list head here. */
	ip_conntrack_vmalloc = 0;
	ip_conntrack_hash
		= (void *)__get_free_pages(GFP_KERNEL,
					   get_order(sizeof(struct list_head)
						     * ip_conntrack_htable_size));
	if (!ip_conntrack_hash) {
		ip_conntrack_vmalloc = 1;
		printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
		ip_conntrack_hash = vmalloc(sizeof(struct list_head)
					    * ip_conntrack_htable_size);
	}
	if (!ip_conntrack_hash) {
		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
		goto err_unreg_sockopt;
	}

	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
						sizeof(struct ip_conntrack), 0,
						0, NULL, NULL);
	if (!ip_conntrack_cachep) {
		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
		goto err_free_hash;
	}

	ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
					sizeof(struct ip_conntrack_expect),
					0, 0, NULL, NULL);
	if (!ip_conntrack_expect_cachep) {
		printk(KERN_ERR "Unable to create ip_expect slab cache\n");
		goto err_free_conntrack_slab;
	}

	/* Don't NEED lock here, but good form anyway. */
	write_lock_bh(&ip_conntrack_lock);
	for (i = 0; i < MAX_IP_CT_PROTO; i++)
		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
	/* Sew in builtin protocols. */
	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
	write_unlock_bh(&ip_conntrack_lock);

	for (i = 0; i < ip_conntrack_htable_size; i++)
		INIT_LIST_HEAD(&ip_conntrack_hash[i]);

	/* For use by ipt_REJECT */
	ip_ct_attach = ip_conntrack_attach;

	/* Set up fake conntrack:
	   - to never be deleted, not in any hashes */
	atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
	/* - and to make it look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);

	return ret;

err_free_conntrack_slab:
	kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
	free_conntrack_hash();
err_unreg_sockopt:
	nf_unregister_sockopt(&so_getorigdst);

	return -ENOMEM;
}