blob: ceaabc18202b71e8b26c7bdbb23fa08fc0c9a780 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/* Connection state tracking for netfilter. This is separated from,
2 but required by, the NAT layer; it can also be used by an iptables
3 extension. */
4
5/* (C) 1999-2001 Paul `Rusty' Russell
6 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13 * - new API and handling of conntrack/nat helpers
14 * - now capable of multiple expectations for one master
15 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16 * - add usage/reference counts to ip_conntrack_expect
17 * - export ip_conntrack[_expect]_{find_get,put} functions
18 * */
19
20#include <linux/config.h>
21#include <linux/types.h>
22#include <linux/icmp.h>
23#include <linux/ip.h>
24#include <linux/netfilter.h>
25#include <linux/netfilter_ipv4.h>
26#include <linux/module.h>
27#include <linux/skbuff.h>
28#include <linux/proc_fs.h>
29#include <linux/vmalloc.h>
30#include <net/checksum.h>
31#include <net/ip.h>
32#include <linux/stddef.h>
33#include <linux/sysctl.h>
34#include <linux/slab.h>
35#include <linux/random.h>
36#include <linux/jhash.h>
37#include <linux/err.h>
38#include <linux/percpu.h>
39#include <linux/moduleparam.h>
Harald Welteac3247b2005-08-09 19:28:03 -070040#include <linux/notifier.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070041
/* ip_conntrack_lock protects the main hash table, the protocol/helper/
   expectation registrations and the conntrack timers. */

/* Lock assertion hooks.  Both expand to nothing, so no lock state is
   actually checked.  NOTE(review): they are defined ahead of the
   netfilter includes below, presumably because listhelp.h expects
   them -- confirm. */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)
Linus Torvalds1da177e2005-04-16 15:20:36 -070046
47#include <linux/netfilter_ipv4/ip_conntrack.h>
48#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50#include <linux/netfilter_ipv4/ip_conntrack_core.h>
51#include <linux/netfilter_ipv4/listhelp.h>
52
/* Version string of this conntrack implementation. */
#define IP_CONNTRACK_VERSION	"2.4"

/* Debug tracing.  DEBUGP() compiles away to nothing; change the
   condition below to 1 to route it to printk instead. */
#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif
60
Patrick McHardye45b1be2005-06-21 14:01:30 -070061DEFINE_RWLOCK(ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -070062
63/* ip_conntrack_standalone needs this */
64atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67LIST_HEAD(ip_conntrack_expect_list);
68struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69static LIST_HEAD(helpers);
70unsigned int ip_conntrack_htable_size = 0;
71int ip_conntrack_max;
72struct list_head *ip_conntrack_hash;
Eric Dumazetba899662005-08-26 12:05:31 -070073static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070075struct ip_conntrack ip_conntrack_untracked;
76unsigned int ip_ct_log_invalid;
77static LIST_HEAD(unconfirmed);
78static int ip_conntrack_vmalloc;
79
Pablo Neira Ayuso4e3882f2006-03-22 13:55:11 -080080static unsigned int ip_conntrack_next_id;
81static unsigned int ip_conntrack_expect_next_id;
Harald Welteac3247b2005-08-09 19:28:03 -070082#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
Alan Sterne041c682006-03-27 01:16:30 -080083ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
84ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
Harald Welteac3247b2005-08-09 19:28:03 -070085
86DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
Patrick McHardya86888b2005-08-09 20:02:13 -070088/* deliver cached events and clear cache entry - must be called with locally
89 * disabled softirqs */
90static inline void
91__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
Harald Welteac3247b2005-08-09 19:28:03 -070092{
Patrick McHardya86888b2005-08-09 20:02:13 -070093 DEBUGP("ecache: delivering events for %p\n", ecache->ct);
Harald Welteac3247b2005-08-09 19:28:03 -070094 if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
Alan Sterne041c682006-03-27 01:16:30 -080095 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
Harald Welteac3247b2005-08-09 19:28:03 -070096 ecache->ct);
97 ecache->events = 0;
Patrick McHardya86888b2005-08-09 20:02:13 -070098 ip_conntrack_put(ecache->ct);
99 ecache->ct = NULL;
Harald Welteac3247b2005-08-09 19:28:03 -0700100}
101
102/* Deliver all cached events for a particular conntrack. This is called
103 * by code prior to async packet handling or freeing the skb */
Patrick McHardya86888b2005-08-09 20:02:13 -0700104void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
Harald Welteac3247b2005-08-09 19:28:03 -0700105{
Patrick McHardya86888b2005-08-09 20:02:13 -0700106 struct ip_conntrack_ecache *ecache;
107
108 local_bh_disable();
109 ecache = &__get_cpu_var(ip_conntrack_ecache);
110 if (ecache->ct == ct)
111 __ip_ct_deliver_cached_events(ecache);
112 local_bh_enable();
Harald Welteac3247b2005-08-09 19:28:03 -0700113}
114
Patrick McHardya86888b2005-08-09 20:02:13 -0700115void __ip_ct_event_cache_init(struct ip_conntrack *ct)
Harald Welteac3247b2005-08-09 19:28:03 -0700116{
Patrick McHardya86888b2005-08-09 20:02:13 -0700117 struct ip_conntrack_ecache *ecache;
Harald Welteac3247b2005-08-09 19:28:03 -0700118
119 /* take care of delivering potentially old events */
Patrick McHardya86888b2005-08-09 20:02:13 -0700120 ecache = &__get_cpu_var(ip_conntrack_ecache);
121 BUG_ON(ecache->ct == ct);
122 if (ecache->ct)
123 __ip_ct_deliver_cached_events(ecache);
124 /* initialize for this conntrack/packet */
125 ecache->ct = ct;
126 nf_conntrack_get(&ct->ct_general);
Harald Welteac3247b2005-08-09 19:28:03 -0700127}
128
Patrick McHardya86888b2005-08-09 20:02:13 -0700129/* flush the event cache - touches other CPU's data and must not be called while
130 * packets are still passing through the code */
131static void ip_ct_event_cache_flush(void)
132{
133 struct ip_conntrack_ecache *ecache;
134 int cpu;
135
136 for_each_cpu(cpu) {
137 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138 if (ecache->ct)
139 ip_conntrack_put(ecache->ct);
140 }
141}
142#else
/* Stub for builds without CONFIG_IP_NF_CONNTRACK_EVENTS: there is no
 * event cache, hence nothing to flush. */
static inline void ip_ct_event_cache_flush(void)
{
}
Harald Welteac3247b2005-08-09 19:28:03 -0700144#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
Linus Torvalds1da177e2005-04-16 15:20:36 -0700146DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148static int ip_conntrack_hash_rnd_initted;
149static unsigned int ip_conntrack_hash_rnd;
150
Harald Welteeed75f12005-10-16 14:22:59 +0200151static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
152 unsigned int size, unsigned int rnd)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154 return (jhash_3words(tuple->src.ip,
155 (tuple->dst.ip ^ tuple->dst.protonum),
156 (tuple->src.u.all | (tuple->dst.u.all << 16)),
Harald Welteeed75f12005-10-16 14:22:59 +0200157 rnd) % size);
158}
159
160static u_int32_t
161hash_conntrack(const struct ip_conntrack_tuple *tuple)
162{
163 return __hash_conntrack(tuple, ip_conntrack_htable_size,
164 ip_conntrack_hash_rnd);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165}
166
167int
168ip_ct_get_tuple(const struct iphdr *iph,
169 const struct sk_buff *skb,
170 unsigned int dataoff,
171 struct ip_conntrack_tuple *tuple,
172 const struct ip_conntrack_protocol *protocol)
173{
174 /* Never happen */
175 if (iph->frag_off & htons(IP_OFFSET)) {
176 printk("ip_conntrack_core: Frag of proto %u.\n",
177 iph->protocol);
178 return 0;
179 }
180
181 tuple->src.ip = iph->saddr;
182 tuple->dst.ip = iph->daddr;
183 tuple->dst.protonum = iph->protocol;
184 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
185
186 return protocol->pkt_to_tuple(skb, dataoff, tuple);
187}
188
189int
190ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
191 const struct ip_conntrack_tuple *orig,
192 const struct ip_conntrack_protocol *protocol)
193{
194 inverse->src.ip = orig->dst.ip;
195 inverse->dst.ip = orig->src.ip;
196 inverse->dst.protonum = orig->dst.protonum;
197 inverse->dst.dir = !orig->dst.dir;
198
199 return protocol->invert_tuple(inverse, orig);
200}
201
202
203/* ip_conntrack_expect helper functions */
Pablo Neira Ayuso49719eb2005-09-06 15:10:46 -0700204void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700205{
Patrick McHardye45b1be2005-06-21 14:01:30 -0700206 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700207 IP_NF_ASSERT(!timer_pending(&exp->timeout));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700208 list_del(&exp->list);
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700209 CONNTRACK_STAT_INC(expect_delete);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700210 exp->master->expecting--;
Pablo Neira Ayuso37012f72005-08-09 20:06:11 -0700211 ip_conntrack_expect_put(exp);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700212}
213
214static void expectation_timed_out(unsigned long ul_expect)
215{
216 struct ip_conntrack_expect *exp = (void *)ul_expect;
217
Patrick McHardye45b1be2005-06-21 14:01:30 -0700218 write_lock_bh(&ip_conntrack_lock);
Pablo Neira Ayuso49719eb2005-09-06 15:10:46 -0700219 ip_ct_unlink_expect(exp);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700220 write_unlock_bh(&ip_conntrack_lock);
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700221 ip_conntrack_expect_put(exp);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700222}
223
Harald Welte080774a2005-08-09 19:32:58 -0700224struct ip_conntrack_expect *
225__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
226{
227 struct ip_conntrack_expect *i;
228
229 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
230 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
231 atomic_inc(&i->use);
232 return i;
233 }
234 }
235 return NULL;
236}
237
238/* Just find a expectation corresponding to a tuple. */
239struct ip_conntrack_expect *
Patrick McHardya41bc002005-09-19 15:35:31 -0700240ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
Harald Welte080774a2005-08-09 19:32:58 -0700241{
242 struct ip_conntrack_expect *i;
243
244 read_lock_bh(&ip_conntrack_lock);
245 i = __ip_conntrack_expect_find(tuple);
246 read_unlock_bh(&ip_conntrack_lock);
247
248 return i;
249}
250
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251/* If an expectation for this connection is found, it gets delete from
252 * global list then returned. */
253static struct ip_conntrack_expect *
254find_expectation(const struct ip_conntrack_tuple *tuple)
255{
256 struct ip_conntrack_expect *i;
257
258 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
259 /* If master is not in hash table yet (ie. packet hasn't left
260 this machine yet), how can other end know about expected?
261 Hence these are not the droids you are looking for (if
262 master ct never got confirmed, we'd hold a reference to it
263 and weird things would happen to future packets). */
264 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
Patrick McHardy2248bcf2005-09-06 15:06:42 -0700265 && is_confirmed(i->master)) {
266 if (i->flags & IP_CT_EXPECT_PERMANENT) {
267 atomic_inc(&i->use);
268 return i;
269 } else if (del_timer(&i->timeout)) {
Pablo Neira Ayuso49719eb2005-09-06 15:10:46 -0700270 ip_ct_unlink_expect(i);
Patrick McHardy2248bcf2005-09-06 15:06:42 -0700271 return i;
272 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273 }
274 }
275 return NULL;
276}
277
278/* delete all expectations for this conntrack */
Harald Welte080774a2005-08-09 19:32:58 -0700279void ip_ct_remove_expectations(struct ip_conntrack *ct)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280{
281 struct ip_conntrack_expect *i, *tmp;
282
283 /* Optimization: most connection never expect any others. */
284 if (ct->expecting == 0)
285 return;
286
287 list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
288 if (i->master == ct && del_timer(&i->timeout)) {
Pablo Neira Ayuso49719eb2005-09-06 15:10:46 -0700289 ip_ct_unlink_expect(i);
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700290 ip_conntrack_expect_put(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700291 }
292 }
293}
294
295static void
296clean_from_lists(struct ip_conntrack *ct)
297{
298 unsigned int ho, hr;
299
300 DEBUGP("clean_from_lists(%p)\n", ct);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700301 ASSERT_WRITE_LOCK(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302
303 ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
304 hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
305 LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
306 LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
307
308 /* Destroy all pending expectations */
Harald Welte080774a2005-08-09 19:32:58 -0700309 ip_ct_remove_expectations(ct);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700310}
311
312static void
313destroy_conntrack(struct nf_conntrack *nfct)
314{
315 struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
316 struct ip_conntrack_protocol *proto;
317
318 DEBUGP("destroy_conntrack(%p)\n", ct);
319 IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
320 IP_NF_ASSERT(!timer_pending(&ct->timeout));
321
Pablo Neira Ayuso14a50bb2005-08-09 20:05:52 -0700322 ip_conntrack_event(IPCT_DESTROY, ct);
Harald Welteac3247b2005-08-09 19:28:03 -0700323 set_bit(IPS_DYING_BIT, &ct->status);
324
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325 /* To make sure we don't get any weird locking issues here:
326 * destroy_conntrack() MUST NOT be called with a write lock
327 * to ip_conntrack_lock!!! -HW */
Harald Welte080774a2005-08-09 19:32:58 -0700328 proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329 if (proto && proto->destroy)
330 proto->destroy(ct);
331
332 if (ip_conntrack_destroyed)
333 ip_conntrack_destroyed(ct);
334
Patrick McHardye45b1be2005-06-21 14:01:30 -0700335 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336 /* Expectations will have been removed in clean_from_lists,
337 * except TFTP can create an expectation on the first packet,
338 * before connection is in the list, so we need to clean here,
339 * too. */
Harald Welte080774a2005-08-09 19:32:58 -0700340 ip_ct_remove_expectations(ct);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700341
342 /* We overload first tuple to link into unconfirmed list. */
343 if (!is_confirmed(ct)) {
344 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
345 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
346 }
347
348 CONNTRACK_STAT_INC(delete);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700349 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350
351 if (ct->master)
352 ip_conntrack_put(ct->master);
353
354 DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
Harald Welte080774a2005-08-09 19:32:58 -0700355 ip_conntrack_free(ct);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700356}
357
358static void death_by_timeout(unsigned long ul_conntrack)
359{
360 struct ip_conntrack *ct = (void *)ul_conntrack;
361
Patrick McHardye45b1be2005-06-21 14:01:30 -0700362 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700363 /* Inside lock so preempt is disabled on module removal path.
364 * Otherwise we can get spurious warnings. */
365 CONNTRACK_STAT_INC(delete_list);
366 clean_from_lists(ct);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700367 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700368 ip_conntrack_put(ct);
369}
370
371static inline int
372conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
373 const struct ip_conntrack_tuple *tuple,
374 const struct ip_conntrack *ignored_conntrack)
375{
Patrick McHardye45b1be2005-06-21 14:01:30 -0700376 ASSERT_READ_LOCK(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700377 return tuplehash_to_ctrack(i) != ignored_conntrack
378 && ip_ct_tuple_equal(tuple, &i->tuple);
379}
380
Harald Welte080774a2005-08-09 19:32:58 -0700381struct ip_conntrack_tuple_hash *
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
383 const struct ip_conntrack *ignored_conntrack)
384{
385 struct ip_conntrack_tuple_hash *h;
386 unsigned int hash = hash_conntrack(tuple);
387
Patrick McHardye45b1be2005-06-21 14:01:30 -0700388 ASSERT_READ_LOCK(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700389 list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
390 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
391 CONNTRACK_STAT_INC(found);
392 return h;
393 }
394 CONNTRACK_STAT_INC(searched);
395 }
396
397 return NULL;
398}
399
400/* Find a connection corresponding to a tuple. */
401struct ip_conntrack_tuple_hash *
402ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
403 const struct ip_conntrack *ignored_conntrack)
404{
405 struct ip_conntrack_tuple_hash *h;
406
Patrick McHardye45b1be2005-06-21 14:01:30 -0700407 read_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700408 h = __ip_conntrack_find(tuple, ignored_conntrack);
409 if (h)
410 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700411 read_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700412
413 return h;
414}
415
Harald Welte080774a2005-08-09 19:32:58 -0700416static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
417 unsigned int hash,
418 unsigned int repl_hash)
419{
420 ct->id = ++ip_conntrack_next_id;
421 list_prepend(&ip_conntrack_hash[hash],
422 &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
423 list_prepend(&ip_conntrack_hash[repl_hash],
424 &ct->tuplehash[IP_CT_DIR_REPLY].list);
425}
426
427void ip_conntrack_hash_insert(struct ip_conntrack *ct)
428{
429 unsigned int hash, repl_hash;
430
431 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
432 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
433
434 write_lock_bh(&ip_conntrack_lock);
435 __ip_conntrack_hash_insert(ct, hash, repl_hash);
436 write_unlock_bh(&ip_conntrack_lock);
437}
438
Linus Torvalds1da177e2005-04-16 15:20:36 -0700439/* Confirm a connection given skb; places it in hash table */
440int
441__ip_conntrack_confirm(struct sk_buff **pskb)
442{
443 unsigned int hash, repl_hash;
444 struct ip_conntrack *ct;
445 enum ip_conntrack_info ctinfo;
446
447 ct = ip_conntrack_get(*pskb, &ctinfo);
448
449 /* ipt_REJECT uses ip_conntrack_attach to attach related
450 ICMP/TCP RST packets in other direction. Actual packet
451 which created connection will be IP_CT_NEW or for an
452 expected connection, IP_CT_RELATED. */
453 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
454 return NF_ACCEPT;
455
456 hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
457 repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
458
459 /* We're not in hash table, and we refuse to set up related
460 connections for unconfirmed conns. But packet copies and
461 REJECT will give spurious warnings here. */
462 /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
463
464 /* No external references means noone else could have
465 confirmed us. */
466 IP_NF_ASSERT(!is_confirmed(ct));
467 DEBUGP("Confirming conntrack %p\n", ct);
468
Patrick McHardye45b1be2005-06-21 14:01:30 -0700469 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700470
471 /* See if there's one in the list already, including reverse:
472 NAT could have grabbed it without realizing, since we're
473 not in the hash. If there is, we lost race. */
474 if (!LIST_FIND(&ip_conntrack_hash[hash],
475 conntrack_tuple_cmp,
476 struct ip_conntrack_tuple_hash *,
477 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
478 && !LIST_FIND(&ip_conntrack_hash[repl_hash],
479 conntrack_tuple_cmp,
480 struct ip_conntrack_tuple_hash *,
481 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
482 /* Remove from unconfirmed list */
483 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
484
Harald Welte080774a2005-08-09 19:32:58 -0700485 __ip_conntrack_hash_insert(ct, hash, repl_hash);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486 /* Timer relative to confirmation time, not original
487 setting time, otherwise we'd get timer wrap in
488 weird delay cases. */
489 ct->timeout.expires += jiffies;
490 add_timer(&ct->timeout);
491 atomic_inc(&ct->ct_general.use);
492 set_bit(IPS_CONFIRMED_BIT, &ct->status);
493 CONNTRACK_STAT_INC(insert);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700494 write_unlock_bh(&ip_conntrack_lock);
Harald Welteac3247b2005-08-09 19:28:03 -0700495 if (ct->helper)
496 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
497#ifdef CONFIG_IP_NF_NAT_NEEDED
498 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
499 test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
500 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
501#endif
502 ip_conntrack_event_cache(master_ct(ct) ?
503 IPCT_RELATED : IPCT_NEW, *pskb);
504
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505 return NF_ACCEPT;
506 }
507
508 CONNTRACK_STAT_INC(insert_failed);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700509 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700510
511 return NF_DROP;
512}
513
514/* Returns true if a connection correspondings to the tuple (required
515 for NAT). */
516int
517ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
518 const struct ip_conntrack *ignored_conntrack)
519{
520 struct ip_conntrack_tuple_hash *h;
521
Patrick McHardye45b1be2005-06-21 14:01:30 -0700522 read_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700523 h = __ip_conntrack_find(tuple, ignored_conntrack);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700524 read_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700525
526 return h != NULL;
527}
528
529/* There's a small race here where we may free a just-assured
530 connection. Too bad: we're in trouble anyway. */
531static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
532{
533 return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
534}
535
536static int early_drop(struct list_head *chain)
537{
538 /* Traverse backwards: gives us oldest, which is roughly LRU */
539 struct ip_conntrack_tuple_hash *h;
540 struct ip_conntrack *ct = NULL;
541 int dropped = 0;
542
Patrick McHardye45b1be2005-06-21 14:01:30 -0700543 read_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700544 h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
545 if (h) {
546 ct = tuplehash_to_ctrack(h);
547 atomic_inc(&ct->ct_general.use);
548 }
Patrick McHardye45b1be2005-06-21 14:01:30 -0700549 read_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700550
551 if (!ct)
552 return dropped;
553
554 if (del_timer(&ct->timeout)) {
555 death_by_timeout((unsigned long)ct);
556 dropped = 1;
557 CONNTRACK_STAT_INC(early_drop);
558 }
559 ip_conntrack_put(ct);
560 return dropped;
561}
562
563static inline int helper_cmp(const struct ip_conntrack_helper *i,
564 const struct ip_conntrack_tuple *rtuple)
565{
566 return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
567}
568
Harald Welte080774a2005-08-09 19:32:58 -0700569static struct ip_conntrack_helper *
570__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700571{
572 return LIST_FIND(&helpers, helper_cmp,
573 struct ip_conntrack_helper *,
574 tuple);
575}
576
Harald Welte080774a2005-08-09 19:32:58 -0700577struct ip_conntrack_helper *
578ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple)
579{
580 struct ip_conntrack_helper *helper;
581
582 /* need ip_conntrack_lock to assure that helper exists until
583 * try_module_get() is called */
584 read_lock_bh(&ip_conntrack_lock);
585
586 helper = __ip_conntrack_helper_find(tuple);
587 if (helper) {
588 /* need to increase module usage count to assure helper will
589 * not go away while the caller is e.g. busy putting a
590 * conntrack in the hash that uses the helper */
591 if (!try_module_get(helper->me))
592 helper = NULL;
593 }
594
595 read_unlock_bh(&ip_conntrack_lock);
596
597 return helper;
598}
599
600void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
601{
602 module_put(helper->me);
603}
604
605struct ip_conntrack_protocol *
606__ip_conntrack_proto_find(u_int8_t protocol)
607{
608 return ip_ct_protos[protocol];
609}
610
611/* this is guaranteed to always return a valid protocol helper, since
612 * it falls back to generic_protocol */
613struct ip_conntrack_protocol *
614ip_conntrack_proto_find_get(u_int8_t protocol)
615{
616 struct ip_conntrack_protocol *p;
617
618 preempt_disable();
619 p = __ip_conntrack_proto_find(protocol);
620 if (p) {
621 if (!try_module_get(p->me))
622 p = &ip_conntrack_generic_protocol;
623 }
624 preempt_enable();
625
626 return p;
627}
628
629void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
630{
631 module_put(p->me);
632}
633
634struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
635 struct ip_conntrack_tuple *repl)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700636{
637 struct ip_conntrack *conntrack;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638
639 if (!ip_conntrack_hash_rnd_initted) {
640 get_random_bytes(&ip_conntrack_hash_rnd, 4);
641 ip_conntrack_hash_rnd_initted = 1;
642 }
643
Linus Torvalds1da177e2005-04-16 15:20:36 -0700644 if (ip_conntrack_max
645 && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
Harald Welte080774a2005-08-09 19:32:58 -0700646 unsigned int hash = hash_conntrack(orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700647 /* Try dropping from this hash chain. */
648 if (!early_drop(&ip_conntrack_hash[hash])) {
649 if (net_ratelimit())
650 printk(KERN_WARNING
651 "ip_conntrack: table full, dropping"
652 " packet.\n");
653 return ERR_PTR(-ENOMEM);
654 }
655 }
656
Linus Torvalds1da177e2005-04-16 15:20:36 -0700657 conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
658 if (!conntrack) {
659 DEBUGP("Can't allocate conntrack.\n");
Yasuyuki Kozakai7663f182005-08-09 20:24:15 -0700660 return ERR_PTR(-ENOMEM);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700661 }
662
663 memset(conntrack, 0, sizeof(*conntrack));
664 atomic_set(&conntrack->ct_general.use, 1);
665 conntrack->ct_general.destroy = destroy_conntrack;
Harald Welte080774a2005-08-09 19:32:58 -0700666 conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
667 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700668 /* Don't set timer yet: wait for confirmation */
669 init_timer(&conntrack->timeout);
670 conntrack->timeout.data = (unsigned long)conntrack;
671 conntrack->timeout.function = death_by_timeout;
672
Harald Welte080774a2005-08-09 19:32:58 -0700673 atomic_inc(&ip_conntrack_count);
674
675 return conntrack;
676}
677
678void
679ip_conntrack_free(struct ip_conntrack *conntrack)
680{
681 atomic_dec(&ip_conntrack_count);
682 kmem_cache_free(ip_conntrack_cachep, conntrack);
683}
684
685/* Allocate a new conntrack: we return -ENOMEM if classification
686 * failed due to stress. Otherwise it really is unclassifiable */
687static struct ip_conntrack_tuple_hash *
688init_conntrack(struct ip_conntrack_tuple *tuple,
689 struct ip_conntrack_protocol *protocol,
690 struct sk_buff *skb)
691{
692 struct ip_conntrack *conntrack;
693 struct ip_conntrack_tuple repl_tuple;
694 struct ip_conntrack_expect *exp;
695
696 if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
697 DEBUGP("Can't invert tuple.\n");
698 return NULL;
699 }
700
Yasuyuki Kozakai7663f182005-08-09 20:24:15 -0700701 conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
702 if (conntrack == NULL || IS_ERR(conntrack))
703 return (struct ip_conntrack_tuple_hash *)conntrack;
Harald Welte080774a2005-08-09 19:32:58 -0700704
705 if (!protocol->new(conntrack, skb)) {
706 ip_conntrack_free(conntrack);
707 return NULL;
708 }
709
Patrick McHardye45b1be2005-06-21 14:01:30 -0700710 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700711 exp = find_expectation(tuple);
712
713 if (exp) {
714 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
715 conntrack, exp);
716 /* Welcome, Mr. Bond. We've been expecting you... */
717 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
718 conntrack->master = exp->master;
Nick Sillik7cee4322005-07-27 14:46:03 -0700719#ifdef CONFIG_IP_NF_CONNTRACK_MARK
Linus Torvalds1da177e2005-04-16 15:20:36 -0700720 conntrack->mark = exp->master->mark;
721#endif
Harald Welte1f494c02005-07-30 17:44:07 -0700722#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
723 defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
724 /* this is ugly, but there is no other place where to put it */
725 conntrack->nat.masq_index = exp->master->nat.masq_index;
726#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700727 nf_conntrack_get(&conntrack->master->ct_general);
728 CONNTRACK_STAT_INC(expect_new);
729 } else {
Harald Welte080774a2005-08-09 19:32:58 -0700730 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700731
732 CONNTRACK_STAT_INC(new);
733 }
734
735 /* Overload tuple linked list to put us in unconfirmed list. */
736 list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
737
Patrick McHardye45b1be2005-06-21 14:01:30 -0700738 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700739
740 if (exp) {
741 if (exp->expectfn)
742 exp->expectfn(conntrack, exp);
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700743 ip_conntrack_expect_put(exp);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700744 }
745
746 return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
747}
748
749/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
750static inline struct ip_conntrack *
751resolve_normal_ct(struct sk_buff *skb,
752 struct ip_conntrack_protocol *proto,
753 int *set_reply,
754 unsigned int hooknum,
755 enum ip_conntrack_info *ctinfo)
756{
757 struct ip_conntrack_tuple tuple;
758 struct ip_conntrack_tuple_hash *h;
759 struct ip_conntrack *ct;
760
761 IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
762
763 if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
764 &tuple,proto))
765 return NULL;
766
767 /* look for tuple match */
768 h = ip_conntrack_find_get(&tuple, NULL);
769 if (!h) {
770 h = init_conntrack(&tuple, proto, skb);
771 if (!h)
772 return NULL;
773 if (IS_ERR(h))
774 return (void *)h;
775 }
776 ct = tuplehash_to_ctrack(h);
777
778 /* It exists; we have (non-exclusive) reference. */
779 if (DIRECTION(h) == IP_CT_DIR_REPLY) {
780 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
781 /* Please set reply bit if this packet OK */
782 *set_reply = 1;
783 } else {
784 /* Once we've had two way comms, always ESTABLISHED. */
785 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
786 DEBUGP("ip_conntrack_in: normal packet for %p\n",
787 ct);
788 *ctinfo = IP_CT_ESTABLISHED;
789 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
790 DEBUGP("ip_conntrack_in: related packet for %p\n",
791 ct);
792 *ctinfo = IP_CT_RELATED;
793 } else {
794 DEBUGP("ip_conntrack_in: new packet for %p\n",
795 ct);
796 *ctinfo = IP_CT_NEW;
797 }
798 *set_reply = 0;
799 }
800 skb->nfct = &ct->ct_general;
801 skb->nfctinfo = *ctinfo;
802 return ct;
803}
804
805/* Netfilter hook itself. */
806unsigned int ip_conntrack_in(unsigned int hooknum,
807 struct sk_buff **pskb,
808 const struct net_device *in,
809 const struct net_device *out,
810 int (*okfn)(struct sk_buff *))
811{
812 struct ip_conntrack *ct;
813 enum ip_conntrack_info ctinfo;
814 struct ip_conntrack_protocol *proto;
Harald Welteac3247b2005-08-09 19:28:03 -0700815 int set_reply = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700816 int ret;
817
818 /* Previously seen (loopback or untracked)? Ignore. */
819 if ((*pskb)->nfct) {
820 CONNTRACK_STAT_INC(ignore);
821 return NF_ACCEPT;
822 }
823
824 /* Never happen */
825 if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
826 if (net_ratelimit()) {
827 printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
828 (*pskb)->nh.iph->protocol, hooknum);
829 }
830 return NF_DROP;
831 }
832
Linus Torvalds1da177e2005-04-16 15:20:36 -0700833/* Doesn't cover locally-generated broadcast, so not worth it. */
834#if 0
835 /* Ignore broadcast: no `connection'. */
836 if ((*pskb)->pkt_type == PACKET_BROADCAST) {
837 printk("Broadcast packet!\n");
838 return NF_ACCEPT;
839 } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
840 == htonl(0x000000FF)) {
841 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
842 NIPQUAD((*pskb)->nh.iph->saddr),
843 NIPQUAD((*pskb)->nh.iph->daddr),
844 (*pskb)->sk, (*pskb)->pkt_type);
845 }
846#endif
847
Harald Welte080774a2005-08-09 19:32:58 -0700848 proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700849
850 /* It may be an special packet, error, unclean...
851 * inverse of the return code tells to the netfilter
852 * core what to do with the packet. */
853 if (proto->error != NULL
854 && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
855 CONNTRACK_STAT_INC(error);
856 CONNTRACK_STAT_INC(invalid);
857 return -ret;
858 }
859
860 if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
861 /* Not valid part of a connection */
862 CONNTRACK_STAT_INC(invalid);
863 return NF_ACCEPT;
864 }
865
866 if (IS_ERR(ct)) {
867 /* Too stressed to deal. */
868 CONNTRACK_STAT_INC(drop);
869 return NF_DROP;
870 }
871
872 IP_NF_ASSERT((*pskb)->nfct);
873
874 ret = proto->packet(ct, *pskb, ctinfo);
875 if (ret < 0) {
876 /* Invalid: inverse of the return code tells
877 * the netfilter core what to do*/
878 nf_conntrack_put((*pskb)->nfct);
879 (*pskb)->nfct = NULL;
880 CONNTRACK_STAT_INC(invalid);
881 return -ret;
882 }
883
Harald Welteac3247b2005-08-09 19:28:03 -0700884 if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
885 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700886
887 return ret;
888}
889
890int invert_tuplepr(struct ip_conntrack_tuple *inverse,
891 const struct ip_conntrack_tuple *orig)
892{
893 return ip_ct_invert_tuple(inverse, orig,
Harald Welte080774a2005-08-09 19:32:58 -0700894 __ip_conntrack_proto_find(orig->dst.protonum));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700895}
896
897/* Would two expected things clash? */
898static inline int expect_clash(const struct ip_conntrack_expect *a,
899 const struct ip_conntrack_expect *b)
900{
901 /* Part covered by intersection of masks must be unequal,
902 otherwise they clash */
903 struct ip_conntrack_tuple intersect_mask
904 = { { a->mask.src.ip & b->mask.src.ip,
905 { a->mask.src.u.all & b->mask.src.u.all } },
906 { a->mask.dst.ip & b->mask.dst.ip,
907 { a->mask.dst.u.all & b->mask.dst.u.all },
908 a->mask.dst.protonum & b->mask.dst.protonum } };
909
910 return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
911}
912
913static inline int expect_matches(const struct ip_conntrack_expect *a,
914 const struct ip_conntrack_expect *b)
915{
916 return a->master == b->master
917 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
918 && ip_ct_tuple_equal(&a->mask, &b->mask);
919}
920
921/* Generally a bad idea to call this: could have matched already. */
922void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
923{
924 struct ip_conntrack_expect *i;
925
Patrick McHardye45b1be2005-06-21 14:01:30 -0700926 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700927 /* choose the the oldest expectation to evict */
928 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
929 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
Pablo Neira Ayuso49719eb2005-09-06 15:10:46 -0700930 ip_ct_unlink_expect(i);
Patrick McHardye45b1be2005-06-21 14:01:30 -0700931 write_unlock_bh(&ip_conntrack_lock);
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700932 ip_conntrack_expect_put(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933 return;
934 }
935 }
Patrick McHardye45b1be2005-06-21 14:01:30 -0700936 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700937}
938
Pablo Neira Ayuso91c46e22005-09-06 15:10:23 -0700939/* We don't increase the master conntrack refcount for non-fulfilled
940 * conntracks. During the conntrack destruction, the expectations are
941 * always killed before the conntrack itself */
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700942struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700943{
944 struct ip_conntrack_expect *new;
945
946 new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
947 if (!new) {
948 DEBUGP("expect_related: OOM allocating expect\n");
949 return NULL;
950 }
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700951 new->master = me;
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700952 atomic_set(&new->use, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700953 return new;
954}
955
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700956void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700957{
Pablo Neira Ayuso91c46e22005-09-06 15:10:23 -0700958 if (atomic_dec_and_test(&exp->use))
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700959 kmem_cache_free(ip_conntrack_expect_cachep, exp);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700960}
961
962static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
963{
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700964 atomic_inc(&exp->use);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700965 exp->master->expecting++;
966 list_add(&exp->list, &ip_conntrack_expect_list);
967
Phil Oester1d3cdb42005-06-21 14:02:42 -0700968 init_timer(&exp->timeout);
969 exp->timeout.data = (unsigned long)exp;
970 exp->timeout.function = expectation_timed_out;
971 exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
972 add_timer(&exp->timeout);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700973
Harald Welte080774a2005-08-09 19:32:58 -0700974 exp->id = ++ip_conntrack_expect_next_id;
975 atomic_inc(&exp->use);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700976 CONNTRACK_STAT_INC(expect_create);
977}
978
979/* Race with expectations being used means we could have none to find; OK. */
980static void evict_oldest_expect(struct ip_conntrack *master)
981{
982 struct ip_conntrack_expect *i;
983
984 list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
985 if (i->master == master) {
986 if (del_timer(&i->timeout)) {
Pablo Neira Ayuso49719eb2005-09-06 15:10:46 -0700987 ip_ct_unlink_expect(i);
Rusty Russell4acdbdb2005-07-21 13:14:46 -0700988 ip_conntrack_expect_put(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700989 }
990 break;
991 }
992 }
993}
994
995static inline int refresh_timer(struct ip_conntrack_expect *i)
996{
997 if (!del_timer(&i->timeout))
998 return 0;
999
1000 i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1001 add_timer(&i->timeout);
1002 return 1;
1003}
1004
1005int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1006{
1007 struct ip_conntrack_expect *i;
1008 int ret;
1009
1010 DEBUGP("ip_conntrack_expect_related %p\n", related_to);
1011 DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1012 DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
1013
Patrick McHardye45b1be2005-06-21 14:01:30 -07001014 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001015 list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1016 if (expect_matches(i, expect)) {
1017 /* Refresh timer: if it's dying, ignore.. */
1018 if (refresh_timer(i)) {
1019 ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001020 goto out;
1021 }
1022 } else if (expect_clash(i, expect)) {
1023 ret = -EBUSY;
1024 goto out;
1025 }
1026 }
1027
1028 /* Will be over limit? */
1029 if (expect->master->helper->max_expected &&
1030 expect->master->expecting >= expect->master->helper->max_expected)
1031 evict_oldest_expect(expect->master);
1032
1033 ip_conntrack_expect_insert(expect);
Harald Welteac3247b2005-08-09 19:28:03 -07001034 ip_conntrack_expect_event(IPEXP_NEW, expect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001035 ret = 0;
1036out:
Patrick McHardye45b1be2005-06-21 14:01:30 -07001037 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001038 return ret;
1039}
1040
1041/* Alter reply tuple (maybe alter helper). This is for NAT, and is
1042 implicitly racy: see __ip_conntrack_confirm */
1043void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1044 const struct ip_conntrack_tuple *newreply)
1045{
Patrick McHardye45b1be2005-06-21 14:01:30 -07001046 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001047 /* Should be unconfirmed, so not in hash table yet */
1048 IP_NF_ASSERT(!is_confirmed(conntrack));
1049
1050 DEBUGP("Altering reply tuple of %p to ", conntrack);
1051 DUMP_TUPLE(newreply);
1052
1053 conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1054 if (!conntrack->master && conntrack->expecting == 0)
Harald Welte080774a2005-08-09 19:32:58 -07001055 conntrack->helper = __ip_conntrack_helper_find(newreply);
Patrick McHardye45b1be2005-06-21 14:01:30 -07001056 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001057}
1058
1059int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1060{
1061 BUG_ON(me->timeout == 0);
Patrick McHardye45b1be2005-06-21 14:01:30 -07001062 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001063 list_prepend(&helpers, me);
Patrick McHardye45b1be2005-06-21 14:01:30 -07001064 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001065
1066 return 0;
1067}
1068
Harald Welte080774a2005-08-09 19:32:58 -07001069struct ip_conntrack_helper *
1070__ip_conntrack_helper_find_byname(const char *name)
1071{
1072 struct ip_conntrack_helper *h;
1073
1074 list_for_each_entry(h, &helpers, list) {
1075 if (!strcmp(h->name, name))
1076 return h;
1077 }
1078
1079 return NULL;
1080}
1081
Linus Torvalds1da177e2005-04-16 15:20:36 -07001082static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1083 const struct ip_conntrack_helper *me)
1084{
Harald Welteac3247b2005-08-09 19:28:03 -07001085 if (tuplehash_to_ctrack(i)->helper == me) {
1086 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001087 tuplehash_to_ctrack(i)->helper = NULL;
Harald Welteac3247b2005-08-09 19:28:03 -07001088 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001089 return 0;
1090}
1091
1092void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1093{
1094 unsigned int i;
1095 struct ip_conntrack_expect *exp, *tmp;
1096
1097 /* Need write lock here, to delete helper. */
Patrick McHardye45b1be2005-06-21 14:01:30 -07001098 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001099 LIST_DELETE(&helpers, me);
1100
1101 /* Get rid of expectations */
1102 list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1103 if (exp->master->helper == me && del_timer(&exp->timeout)) {
Pablo Neira Ayuso49719eb2005-09-06 15:10:46 -07001104 ip_ct_unlink_expect(exp);
Rusty Russell4acdbdb2005-07-21 13:14:46 -07001105 ip_conntrack_expect_put(exp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001106 }
1107 }
1108 /* Get rid of expecteds, set helpers to NULL. */
1109 LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1110 for (i = 0; i < ip_conntrack_htable_size; i++)
1111 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1112 struct ip_conntrack_tuple_hash *, me);
Patrick McHardye45b1be2005-06-21 14:01:30 -07001113 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001114
1115 /* Someone could be still looking at the helper in a bh. */
1116 synchronize_net();
1117}
1118
Harald Welte1dfbab52005-09-22 23:46:57 -07001119/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1120void __ip_ct_refresh_acct(struct ip_conntrack *ct,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001121 enum ip_conntrack_info ctinfo,
1122 const struct sk_buff *skb,
Harald Welte1dfbab52005-09-22 23:46:57 -07001123 unsigned long extra_jiffies,
1124 int do_acct)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001125{
Harald Weltea051a8f2005-10-10 21:21:10 -07001126 int event = 0;
Harald Welte1dfbab52005-09-22 23:46:57 -07001127
Linus Torvalds1da177e2005-04-16 15:20:36 -07001128 IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
Harald Welte1dfbab52005-09-22 23:46:57 -07001129 IP_NF_ASSERT(skb);
1130
1131 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001132
1133 /* If not in hash table, timer will not be active yet */
1134 if (!is_confirmed(ct)) {
1135 ct->timeout.expires = extra_jiffies;
Harald Weltea051a8f2005-10-10 21:21:10 -07001136 event = IPCT_REFRESH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001137 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001138 /* Need del_timer for race avoidance (may already be dying). */
1139 if (del_timer(&ct->timeout)) {
1140 ct->timeout.expires = jiffies + extra_jiffies;
1141 add_timer(&ct->timeout);
Harald Weltea051a8f2005-10-10 21:21:10 -07001142 event = IPCT_REFRESH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001143 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001144 }
Harald Welte1dfbab52005-09-22 23:46:57 -07001145
1146#ifdef CONFIG_IP_NF_CT_ACCT
1147 if (do_acct) {
1148 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1149 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1150 ntohs(skb->nh.iph->tot_len);
Harald Weltea051a8f2005-10-10 21:21:10 -07001151 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1152 || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1153 event |= IPCT_COUNTER_FILLING;
Harald Welte1dfbab52005-09-22 23:46:57 -07001154 }
1155#endif
1156
1157 write_unlock_bh(&ip_conntrack_lock);
1158
1159 /* must be unlocked when calling event cache */
Harald Weltea051a8f2005-10-10 21:21:10 -07001160 if (event)
1161 ip_conntrack_event_cache(event, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001162}
1163
Harald Welte080774a2005-08-09 19:32:58 -07001164#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1165 defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1166/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1167 * in ip_conntrack_core, since we don't want the protocols to autoload
1168 * or depend on ctnetlink */
1169int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1170 const struct ip_conntrack_tuple *tuple)
1171{
1172 NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1173 &tuple->src.u.tcp.port);
1174 NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1175 &tuple->dst.u.tcp.port);
1176 return 0;
1177
1178nfattr_failure:
1179 return -1;
1180}
1181
1182int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1183 struct ip_conntrack_tuple *t)
1184{
1185 if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1186 return -EINVAL;
1187
1188 t->src.u.tcp.port =
1189 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1190 t->dst.u.tcp.port =
1191 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1192
1193 return 0;
1194}
1195#endif
1196
Linus Torvalds1da177e2005-04-16 15:20:36 -07001197/* Returns new sk_buff, or NULL */
1198struct sk_buff *
1199ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1200{
David S. Miller8be58932005-05-19 12:36:33 -07001201 skb_orphan(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001202
1203 local_bh_disable();
1204 skb = ip_defrag(skb, user);
1205 local_bh_enable();
1206
Harald Welte6869c4d2005-08-09 19:24:19 -07001207 if (skb)
David S. Miller8be58932005-05-19 12:36:33 -07001208 ip_send_check(skb->nh.iph);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001209 return skb;
1210}
1211
1212/* Used by ipt_REJECT. */
1213static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1214{
1215 struct ip_conntrack *ct;
1216 enum ip_conntrack_info ctinfo;
1217
1218 /* This ICMP is in reverse direction to the packet which caused it */
1219 ct = ip_conntrack_get(skb, &ctinfo);
1220
1221 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1222 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1223 else
1224 ctinfo = IP_CT_RELATED;
1225
1226 /* Attach to new skbuff, and increment count */
1227 nskb->nfct = &ct->ct_general;
1228 nskb->nfctinfo = ctinfo;
1229 nf_conntrack_get(nskb->nfct);
1230}
1231
/* List-walk adapter: call iter on the conntrack that owns hash entry i
 * and return its result. */
static inline int
do_iter(const struct ip_conntrack_tuple_hash *i,
	int (*iter)(struct ip_conntrack *i, void *data),
	void *data)
{
	struct ip_conntrack *ct = tuplehash_to_ctrack(i);

	return iter(ct, data);
}
1239
1240/* Bring out ya dead! */
1241static struct ip_conntrack_tuple_hash *
1242get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1243 void *data, unsigned int *bucket)
1244{
1245 struct ip_conntrack_tuple_hash *h = NULL;
1246
Patrick McHardye45b1be2005-06-21 14:01:30 -07001247 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001248 for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1249 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1250 struct ip_conntrack_tuple_hash *, iter, data);
1251 if (h)
1252 break;
1253 }
1254 if (!h)
1255 h = LIST_FIND_W(&unconfirmed, do_iter,
1256 struct ip_conntrack_tuple_hash *, iter, data);
1257 if (h)
1258 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
Patrick McHardye45b1be2005-06-21 14:01:30 -07001259 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001260
1261 return h;
1262}
1263
1264void
1265ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1266{
1267 struct ip_conntrack_tuple_hash *h;
1268 unsigned int bucket = 0;
1269
1270 while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1271 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1272 /* Time to push up daises... */
1273 if (del_timer(&ct->timeout))
1274 death_by_timeout((unsigned long)ct);
1275 /* ... else the timer will get him soon. */
1276
1277 ip_conntrack_put(ct);
1278 }
1279}
1280
1281/* Fast function for those who don't want to parse /proc (and I don't
1282 blame them). */
1283/* Reversing the socket's dst/src point of view gives us the reply
1284 mapping. */
1285static int
1286getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1287{
1288 struct inet_sock *inet = inet_sk(sk);
1289 struct ip_conntrack_tuple_hash *h;
1290 struct ip_conntrack_tuple tuple;
1291
1292 IP_CT_TUPLE_U_BLANK(&tuple);
1293 tuple.src.ip = inet->rcv_saddr;
1294 tuple.src.u.tcp.port = inet->sport;
1295 tuple.dst.ip = inet->daddr;
1296 tuple.dst.u.tcp.port = inet->dport;
1297 tuple.dst.protonum = IPPROTO_TCP;
1298
1299 /* We only do TCP at the moment: is there a better way? */
1300 if (strcmp(sk->sk_prot->name, "TCP")) {
1301 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1302 return -ENOPROTOOPT;
1303 }
1304
1305 if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1306 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1307 *len, sizeof(struct sockaddr_in));
1308 return -EINVAL;
1309 }
1310
1311 h = ip_conntrack_find_get(&tuple, NULL);
1312 if (h) {
1313 struct sockaddr_in sin;
1314 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1315
1316 sin.sin_family = AF_INET;
1317 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1318 .tuple.dst.u.tcp.port;
1319 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1320 .tuple.dst.ip;
1321
1322 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1323 NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1324 ip_conntrack_put(ct);
1325 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1326 return -EFAULT;
1327 else
1328 return 0;
1329 }
1330 DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1331 NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1332 NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1333 return -ENOENT;
1334}
1335
1336static struct nf_sockopt_ops so_getorigdst = {
1337 .pf = PF_INET,
1338 .get_optmin = SO_ORIGINAL_DST,
1339 .get_optmax = SO_ORIGINAL_DST+1,
1340 .get = &getorigdst,
1341};
1342
/* Iterator predicate that selects every conntrack: always returns 1,
 * ignoring both arguments. */
static int kill_all(struct ip_conntrack *i, void *data)
{
	const int selected = 1;

	return selected;
}
1347
Patrick McHardyafe5c6b2005-12-05 13:33:50 -08001348void ip_conntrack_flush(void)
1349{
1350 ip_ct_iterate_cleanup(kill_all, NULL);
1351}
1352
Harald Welteeed75f12005-10-16 14:22:59 +02001353static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001354{
Harald Welteeed75f12005-10-16 14:22:59 +02001355 if (vmalloced)
1356 vfree(hash);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001357 else
Harald Welteeed75f12005-10-16 14:22:59 +02001358 free_pages((unsigned long)hash,
1359 get_order(sizeof(struct list_head) * size));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001360}
1361
Patrick McHardyafe5c6b2005-12-05 13:33:50 -08001362/* Mishearing the voices in his head, our hero wonders how he's
1363 supposed to kill the mall. */
1364void ip_conntrack_cleanup(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001365{
Patrick McHardyafe5c6b2005-12-05 13:33:50 -08001366 ip_ct_attach = NULL;
1367
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368 /* This makes sure all current packets have passed through
1369 netfilter framework. Roll on, two-stage module
1370 delete... */
1371 synchronize_net();
Harald Welte080774a2005-08-09 19:32:58 -07001372
Patrick McHardya86888b2005-08-09 20:02:13 -07001373 ip_ct_event_cache_flush();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374 i_see_dead_people:
Patrick McHardyafe5c6b2005-12-05 13:33:50 -08001375 ip_conntrack_flush();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001376 if (atomic_read(&ip_conntrack_count) != 0) {
1377 schedule();
1378 goto i_see_dead_people;
1379 }
Patrick McHardy21f930e2005-07-22 12:51:03 -07001380 /* wait until all references to ip_conntrack_untracked are dropped */
1381 while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1382 schedule();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383
1384 kmem_cache_destroy(ip_conntrack_cachep);
1385 kmem_cache_destroy(ip_conntrack_expect_cachep);
Harald Welteeed75f12005-10-16 14:22:59 +02001386 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1387 ip_conntrack_htable_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001388 nf_unregister_sockopt(&so_getorigdst);
1389}
1390
Harald Welteeed75f12005-10-16 14:22:59 +02001391static struct list_head *alloc_hashtable(int size, int *vmalloced)
1392{
1393 struct list_head *hash;
1394 unsigned int i;
1395
1396 *vmalloced = 0;
1397 hash = (void*)__get_free_pages(GFP_KERNEL,
1398 get_order(sizeof(struct list_head)
1399 * size));
1400 if (!hash) {
1401 *vmalloced = 1;
1402 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1403 hash = vmalloc(sizeof(struct list_head) * size);
1404 }
1405
1406 if (hash)
1407 for (i = 0; i < size; i++)
1408 INIT_LIST_HEAD(&hash[i]);
1409
1410 return hash;
1411}
1412
Adrian Bunkd127e942005-11-29 16:28:18 -08001413static int set_hashsize(const char *val, struct kernel_param *kp)
Harald Welteeed75f12005-10-16 14:22:59 +02001414{
1415 int i, bucket, hashsize, vmalloced;
1416 int old_vmalloced, old_size;
1417 int rnd;
1418 struct list_head *hash, *old_hash;
1419 struct ip_conntrack_tuple_hash *h;
1420
1421 /* On boot, we can set this without any fancy locking. */
1422 if (!ip_conntrack_htable_size)
1423 return param_set_int(val, kp);
1424
1425 hashsize = simple_strtol(val, NULL, 0);
1426 if (!hashsize)
1427 return -EINVAL;
1428
1429 hash = alloc_hashtable(hashsize, &vmalloced);
1430 if (!hash)
1431 return -ENOMEM;
1432
1433 /* We have to rehash for the new table anyway, so we also can
1434 * use a new random seed */
1435 get_random_bytes(&rnd, 4);
1436
1437 write_lock_bh(&ip_conntrack_lock);
1438 for (i = 0; i < ip_conntrack_htable_size; i++) {
1439 while (!list_empty(&ip_conntrack_hash[i])) {
1440 h = list_entry(ip_conntrack_hash[i].next,
1441 struct ip_conntrack_tuple_hash, list);
1442 list_del(&h->list);
1443 bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1444 list_add_tail(&h->list, &hash[bucket]);
1445 }
1446 }
1447 old_size = ip_conntrack_htable_size;
1448 old_vmalloced = ip_conntrack_vmalloc;
1449 old_hash = ip_conntrack_hash;
1450
1451 ip_conntrack_htable_size = hashsize;
1452 ip_conntrack_vmalloc = vmalloced;
1453 ip_conntrack_hash = hash;
1454 ip_conntrack_hash_rnd = rnd;
1455 write_unlock_bh(&ip_conntrack_lock);
1456
1457 free_conntrack_hash(old_hash, old_vmalloced, old_size);
1458 return 0;
1459}
1460
1461module_param_call(hashsize, set_hashsize, param_get_uint,
1462 &ip_conntrack_htable_size, 0600);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001463
1464int __init ip_conntrack_init(void)
1465{
1466 unsigned int i;
1467 int ret;
1468
1469 /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
1470 * machine has 256 buckets. >= 1GB machines have 8192 buckets. */
Harald Welteeed75f12005-10-16 14:22:59 +02001471 if (!ip_conntrack_htable_size) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001472 ip_conntrack_htable_size
1473 = (((num_physpages << PAGE_SHIFT) / 16384)
1474 / sizeof(struct list_head));
1475 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1476 ip_conntrack_htable_size = 8192;
1477 if (ip_conntrack_htable_size < 16)
1478 ip_conntrack_htable_size = 16;
1479 }
1480 ip_conntrack_max = 8 * ip_conntrack_htable_size;
1481
1482 printk("ip_conntrack version %s (%u buckets, %d max)"
1483 " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1484 ip_conntrack_htable_size, ip_conntrack_max,
1485 sizeof(struct ip_conntrack));
1486
1487 ret = nf_register_sockopt(&so_getorigdst);
1488 if (ret != 0) {
1489 printk(KERN_ERR "Unable to register netfilter socket option\n");
1490 return ret;
1491 }
1492
Harald Welteeed75f12005-10-16 14:22:59 +02001493 ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1494 &ip_conntrack_vmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001495 if (!ip_conntrack_hash) {
1496 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1497 goto err_unreg_sockopt;
1498 }
1499
1500 ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1501 sizeof(struct ip_conntrack), 0,
1502 0, NULL, NULL);
1503 if (!ip_conntrack_cachep) {
1504 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1505 goto err_free_hash;
1506 }
1507
1508 ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1509 sizeof(struct ip_conntrack_expect),
1510 0, 0, NULL, NULL);
1511 if (!ip_conntrack_expect_cachep) {
1512 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1513 goto err_free_conntrack_slab;
1514 }
1515
1516 /* Don't NEED lock here, but good form anyway. */
Patrick McHardye45b1be2005-06-21 14:01:30 -07001517 write_lock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001518 for (i = 0; i < MAX_IP_CT_PROTO; i++)
1519 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1520 /* Sew in builtin protocols. */
1521 ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1522 ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1523 ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
Patrick McHardye45b1be2005-06-21 14:01:30 -07001524 write_unlock_bh(&ip_conntrack_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526 /* For use by ipt_REJECT */
1527 ip_ct_attach = ip_conntrack_attach;
1528
1529 /* Set up fake conntrack:
1530 - to never be deleted, not in any hashes */
1531 atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1532 /* - and look it like as a confirmed connection */
1533 set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1534
1535 return ret;
1536
1537err_free_conntrack_slab:
1538 kmem_cache_destroy(ip_conntrack_cachep);
1539err_free_hash:
Harald Welteeed75f12005-10-16 14:22:59 +02001540 free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1541 ip_conntrack_htable_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001542err_unreg_sockopt:
1543 nf_unregister_sockopt(&so_getorigdst);
1544
1545 return -ENOMEM;
1546}