/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
 *	- new API and handling of conntrack/nat helpers
 *	- now capable of multiple expectations for one master
 * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
 *	- add usage/reference counts to ip_conntrack_expect
 *	- export ip_conntrack[_expect]_{find_get,put} functions
 * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *	- generalize L3 protocol dependent part.
 * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
 *	- add support for various sizes of conntrack structures.
 *
 * Derived from net/ipv4/netfilter/ip_conntrack_core.c
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>

/* This rwlock protects the main hash table, protocol/helper/expected
   registrations, conntrack timers */
#define ASSERT_READ_LOCK(x)
#define ASSERT_WRITE_LOCK(x)

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_protocol.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>

#define NF_CONNTRACK_VERSION	"0.4.1"

#if 0
#define DEBUGP printk
#else
#define DEBUGP(format, args...)
#endif

DEFINE_RWLOCK(nf_conntrack_lock);

/* nf_conntrack_standalone needs this */
atomic_t nf_conntrack_count = ATOMIC_INIT(0);

void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
LIST_HEAD(nf_conntrack_expect_list);
struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
static LIST_HEAD(helpers);
unsigned int nf_conntrack_htable_size = 0;
int nf_conntrack_max;
struct list_head *nf_conntrack_hash;
static kmem_cache_t *nf_conntrack_expect_cachep;
struct nf_conn nf_conntrack_untracked;
unsigned int nf_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int nf_conntrack_vmalloc;

#ifdef CONFIG_NF_CONNTRACK_EVENTS
struct notifier_block *nf_conntrack_chain;
struct notifier_block *nf_conntrack_expect_chain;

DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);

/* deliver cached events and clear cache entry - must be called with locally
 * disabled softirqs */
static inline void
__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
{
	DEBUGP("ecache: delivering events for %p\n", ecache->ct);
	if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
	    && ecache->events)
		notifier_call_chain(&nf_conntrack_chain, ecache->events,
				    ecache->ct);

	ecache->events = 0;
	nf_ct_put(ecache->ct);
	ecache->ct = NULL;
}

/* Deliver all cached events for a particular conntrack.  This is called
 * by code prior to async packet handling for freeing the skb */
void nf_ct_deliver_cached_events(const struct nf_conn *ct)
{
	struct nf_conntrack_ecache *ecache;

	local_bh_disable();
	ecache = &__get_cpu_var(nf_conntrack_ecache);
	if (ecache->ct == ct)
		__nf_ct_deliver_cached_events(ecache);
	local_bh_enable();
}

/* Deliver any old pending cached events if the current conntrack differs
 * from the cached one */
void __nf_ct_event_cache_init(struct nf_conn *ct)
{
	struct nf_conntrack_ecache *ecache;

	/* take care of delivering potentially old events */
	ecache = &__get_cpu_var(nf_conntrack_ecache);
	BUG_ON(ecache->ct == ct);
	if (ecache->ct)
		__nf_ct_deliver_cached_events(ecache);
	/* initialize for this conntrack/packet */
	ecache->ct = ct;
	nf_conntrack_get(&ct->ct_general);
}

/* flush the event cache - touches other CPUs' data and must not be called
 * while packets are still passing through the code */
static void nf_ct_event_cache_flush(void)
{
	struct nf_conntrack_ecache *ecache;
	int cpu;

	for_each_cpu(cpu) {
		ecache = &per_cpu(nf_conntrack_ecache, cpu);
		if (ecache->ct)
			nf_ct_put(ecache->ct);
	}
}
#else
static inline void nf_ct_event_cache_flush(void) {}
#endif /* CONFIG_NF_CONNTRACK_EVENTS */

DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);

/*
 * This scheme offers various sizes of "struct nf_conn" depending on
 * features (helper, nat, ...)
 */

#define NF_CT_FEATURES_NAMELEN	256
static struct {
	/* name of slab cache. printed in /proc/slabinfo */
	char *name;

	/* size of slab cache */
	size_t size;

	/* slab cache pointer */
	kmem_cache_t *cachep;

	/* allocated slab cache + modules which use this slab cache */
	int use;

	/* Initialization */
	int (*init_conntrack)(struct nf_conn *, u_int32_t);

} nf_ct_cache[NF_CT_F_NUM];

/* protect members of nf_ct_cache except "use" */
DEFINE_RWLOCK(nf_ct_cache_lock);

/* This avoids calling kmem_cache_create() with the same name simultaneously */
DECLARE_MUTEX(nf_ct_cache_mutex);

extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
struct nf_conntrack_protocol *
nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
{
	if (unlikely(nf_ct_protos[l3proto] == NULL))
		return &nf_conntrack_generic_protocol;

	return nf_ct_protos[l3proto][protocol];
}

static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;

static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
				  unsigned int size, unsigned int rnd)
{
	unsigned int a, b;
	a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
		  ((tuple->src.l3num) << 16) | tuple->dst.protonum);
	b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
		  (tuple->src.u.all << 16) | tuple->dst.u.all);

	return jhash_2words(a, b, rnd) % size;
}
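
/*
 * Both jhash passes fold the L3 addresses together with the protocol
 * numbers and the L4 ids (ports, icmp id, ...), and jhash_2words()
 * then mixes in the boot-time random seed "rnd".  The seed keeps the
 * bucket distribution unpredictable, so a remote host cannot
 * deliberately construct tuples that all land in one hash chain.
 */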

static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
{
	return __hash_conntrack(tuple, nf_conntrack_htable_size,
				nf_conntrack_hash_rnd);
}

/* Initialize "struct nf_conn" which has space for a helper */
static int
init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
{
	conntrack->help = (union nf_conntrack_help *)
		(((unsigned long)conntrack->data
		  + (__alignof__(union nf_conntrack_help) - 1))
		 & (~((unsigned long)(__alignof__(union nf_conntrack_help) - 1))));
	return 0;
}
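
/*
 * The expression above simply rounds the address of conntrack->data
 * up to the alignment required by union nf_conntrack_help: with an
 * alignment of 8, for instance, (addr + 7) & ~7UL yields the next
 * 8-byte boundary at or above addr.
 */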

int nf_conntrack_register_cache(u_int32_t features, const char *name,
				size_t size,
				int (*init)(struct nf_conn *, u_int32_t))
{
	int ret = 0;
	char *cache_name;
	kmem_cache_t *cachep;

	DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
	       features, name, size);

	if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
		DEBUGP("nf_conntrack_register_cache: invalid features: 0x%x\n",
		       features);
		return -EINVAL;
	}

	down(&nf_ct_cache_mutex);

	write_lock_bh(&nf_ct_cache_lock);
	/* e.g: multiple helpers are loaded */
	if (nf_ct_cache[features].use > 0) {
		DEBUGP("nf_conntrack_register_cache: already registered.\n");
		if ((!strncmp(nf_ct_cache[features].name, name,
			      NF_CT_FEATURES_NAMELEN))
		    && nf_ct_cache[features].size == size
		    && nf_ct_cache[features].init_conntrack == init) {
			DEBUGP("nf_conntrack_register_cache: reusing.\n");
			nf_ct_cache[features].use++;
			ret = 0;
		} else
			ret = -EBUSY;

		write_unlock_bh(&nf_ct_cache_lock);
		up(&nf_ct_cache_mutex);
		return ret;
	}
	write_unlock_bh(&nf_ct_cache_lock);

	/*
	 * The memory holding the slab cache name must stay alive until
	 * the cache is destroyed.
	 */
	cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
	if (cache_name == NULL) {
		DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
		ret = -ENOMEM;
		goto out_up_mutex;
	}

	if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
	    >= NF_CT_FEATURES_NAMELEN) {
		printk("nf_conntrack_register_cache: name too long\n");
		ret = -EINVAL;
		goto out_free_name;
	}

	cachep = kmem_cache_create(cache_name, size, 0, 0,
				   NULL, NULL);
	if (!cachep) {
		printk("nf_conntrack_register_cache: Can't create slab cache "
		       "for the features = 0x%x\n", features);
		ret = -ENOMEM;
		goto out_free_name;
	}

	write_lock_bh(&nf_ct_cache_lock);
	nf_ct_cache[features].use = 1;
	nf_ct_cache[features].size = size;
	nf_ct_cache[features].init_conntrack = init;
	nf_ct_cache[features].cachep = cachep;
	nf_ct_cache[features].name = cache_name;
	write_unlock_bh(&nf_ct_cache_lock);

	goto out_up_mutex;

out_free_name:
	kfree(cache_name);
out_up_mutex:
	up(&nf_ct_cache_mutex);
	return ret;
}
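
/*
 * A module that needs an enlarged conntrack registers its cache once,
 * before any conntrack with that feature set is allocated; see
 * nf_conntrack_helper_register() below, which registers the
 * NF_CT_F_HELP cache sized to fit union nf_conntrack_help behind
 * struct nf_conn.
 */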

/* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
void nf_conntrack_unregister_cache(u_int32_t features)
{
	kmem_cache_t *cachep;
	char *name;

	/*
	 * The mutex ensures that kmem_cache_create() isn't called with
	 * the same name while the slab cache is being destroyed.
	 */
	DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
	down(&nf_ct_cache_mutex);

	write_lock_bh(&nf_ct_cache_lock);
	if (--nf_ct_cache[features].use > 0) {
		write_unlock_bh(&nf_ct_cache_lock);
		up(&nf_ct_cache_mutex);
		return;
	}
	cachep = nf_ct_cache[features].cachep;
	name = nf_ct_cache[features].name;
	nf_ct_cache[features].cachep = NULL;
	nf_ct_cache[features].name = NULL;
	nf_ct_cache[features].init_conntrack = NULL;
	nf_ct_cache[features].size = 0;
	write_unlock_bh(&nf_ct_cache_lock);

	synchronize_net();

	kmem_cache_destroy(cachep);
	kfree(name);

	up(&nf_ct_cache_mutex);
}

int
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_l3proto *l3proto,
		const struct nf_conntrack_protocol *protocol)
{
	NF_CT_TUPLE_U_BLANK(tuple);

	tuple->src.l3num = l3num;
	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
		return 0;

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return protocol->pkt_to_tuple(skb, dataoff, tuple);
}

int
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_protocol *protocol)
{
	NF_CT_TUPLE_U_BLANK(inverse);

	inverse->src.l3num = orig->src.l3num;
	if (l3proto->invert_tuple(inverse, orig) == 0)
		return 0;

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;
	return protocol->invert_tuple(inverse, orig);
}

/* nf_conntrack_expect helper functions */
static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
{
	ASSERT_WRITE_LOCK(&nf_conntrack_lock);
	NF_CT_ASSERT(!timer_pending(&exp->timeout));
	list_del(&exp->list);
	NF_CT_STAT_INC(expect_delete);
	exp->master->expecting--;
	nf_conntrack_expect_put(exp);
}

static void expectation_timed_out(unsigned long ul_expect)
{
	struct nf_conntrack_expect *exp = (void *)ul_expect;

	write_lock_bh(&nf_conntrack_lock);
	nf_ct_unlink_expect(exp);
	write_unlock_bh(&nf_conntrack_lock);
	nf_conntrack_expect_put(exp);
}

/* If an expectation for this connection is found, it is deleted from
 * the global list and then returned. */
static struct nf_conntrack_expect *
find_expectation(const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_expect *i;

	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
		/* If master is not in hash table yet (ie. packet hasn't left
		   this machine yet), how can other end know about expected?
		   Hence these are not the droids you are looking for (if
		   master ct never got confirmed, we'd hold a reference to it
		   and weird things would happen to future packets). */
		if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
		    && nf_ct_is_confirmed(i->master)) {
			if (i->flags & NF_CT_EXPECT_PERMANENT) {
				atomic_inc(&i->use);
				return i;
			} else if (del_timer(&i->timeout)) {
				nf_ct_unlink_expect(i);
				return i;
			}
		}
	}
	return NULL;
}
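
/*
 * A permanent expectation (NF_CT_EXPECT_PERMANENT) stays on the list
 * and may match any number of connections, so only its refcount is
 * taken.  A normal expectation is one-shot: it is unlinked on first
 * match, and the successful del_timer() guarantees the timeout
 * handler cannot unlink it a second time.
 */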

/* delete all expectations for this conntrack */
static void remove_expectations(struct nf_conn *ct)
{
	struct nf_conntrack_expect *i, *tmp;

	/* Optimization: most connections never expect any others. */
	if (ct->expecting == 0)
		return;

	list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
		if (i->master == ct && del_timer(&i->timeout)) {
			nf_ct_unlink_expect(i);
			nf_conntrack_expect_put(i);
		}
	}
}

static void
clean_from_lists(struct nf_conn *ct)
{
	unsigned int ho, hr;

	DEBUGP("clean_from_lists(%p)\n", ct);
	ASSERT_WRITE_LOCK(&nf_conntrack_lock);

	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
	LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
	LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);

	/* Destroy all pending expectations */
	remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_protocol *proto;

	DEBUGP("destroy_conntrack(%p)\n", ct);
	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
	NF_CT_ASSERT(!timer_pending(&ct->timeout));

	nf_conntrack_event(IPCT_DESTROY, ct);
	set_bit(IPS_DYING_BIT, &ct->status);

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to nf_conntrack_lock!!! -HW */
	l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
	if (l3proto && l3proto->destroy)
		l3proto->destroy(ct);

	proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
				 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
	if (proto && proto->destroy)
		proto->destroy(ct);

	if (nf_conntrack_destroyed)
		nf_conntrack_destroyed(ct);

	write_lock_bh(&nf_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!nf_ct_is_confirmed(ct)) {
		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
	}

	NF_CT_STAT_INC(delete);
	write_unlock_bh(&nf_conntrack_lock);

	if (ct->master)
		nf_ct_put(ct->master);

	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;

	write_lock_bh(&nf_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	NF_CT_STAT_INC(delete_list);
	clean_from_lists(ct);
	write_unlock_bh(&nf_conntrack_lock);
	nf_ct_put(ct);
}

static inline int
conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
		    const struct nf_conntrack_tuple *tuple,
		    const struct nf_conn *ignored_conntrack)
{
	ASSERT_READ_LOCK(&nf_conntrack_lock);
	return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
	       && nf_ct_tuple_equal(tuple, &i->tuple);
}

static struct nf_conntrack_tuple_hash *
__nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
		    const struct nf_conn *ignored_conntrack)
{
	struct nf_conntrack_tuple_hash *h;
	unsigned int hash = hash_conntrack(tuple);

	ASSERT_READ_LOCK(&nf_conntrack_lock);
	list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
			NF_CT_STAT_INC(found);
			return h;
		}
		NF_CT_STAT_INC(searched);
	}

	return NULL;
}

/* Find a connection corresponding to a tuple. */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
		      const struct nf_conn *ignored_conntrack)
{
	struct nf_conntrack_tuple_hash *h;

	read_lock_bh(&nf_conntrack_lock);
	h = __nf_conntrack_find(tuple, ignored_conntrack);
	if (h)
		atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
	read_unlock_bh(&nf_conntrack_lock);

	return h;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff **pskb)
{
	unsigned int hash, repl_hash;
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	ct = nf_ct_get(*pskb, &ctinfo);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in the other direction.  The actual packet
	   which created the connection will be IP_CT_NEW or, for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
	DEBUGP("Confirming conntrack %p\n", ct);

	write_lock_bh(&nf_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost the race. */
	if (!LIST_FIND(&nf_conntrack_hash[hash],
		       conntrack_tuple_cmp,
		       struct nf_conntrack_tuple_hash *,
		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
	    && !LIST_FIND(&nf_conntrack_hash[repl_hash],
			  conntrack_tuple_cmp,
			  struct nf_conntrack_tuple_hash *,
			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
		/* Remove from unconfirmed list */
		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);

		list_prepend(&nf_conntrack_hash[hash],
			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
		list_prepend(&nf_conntrack_hash[repl_hash],
			     &ct->tuplehash[IP_CT_DIR_REPLY]);
		/* Timer relative to confirmation time, not original
		   setting time, otherwise we'd get timer wrap in
		   weird delay cases. */
		ct->timeout.expires += jiffies;
		add_timer(&ct->timeout);
		atomic_inc(&ct->ct_general.use);
		set_bit(IPS_CONFIRMED_BIT, &ct->status);
		NF_CT_STAT_INC(insert);
		write_unlock_bh(&nf_conntrack_lock);
		if (ct->helper)
			nf_conntrack_event_cache(IPCT_HELPER, *pskb);
#ifdef CONFIG_NF_NAT_NEEDED
		if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
		    test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
			nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
#endif
		nf_conntrack_event_cache(master_ct(ct) ?
					 IPCT_RELATED : IPCT_NEW, *pskb);
		return NF_ACCEPT;
	}

	NF_CT_STAT_INC(insert_failed);
	write_unlock_bh(&nf_conntrack_lock);
	return NF_DROP;
}
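
/*
 * Confirmation runs once the first packet of a connection has passed
 * through the other hooks, NAT included, so both tuples are final by
 * the time the conntrack enters the hash.  Until then the conntrack
 * lives only on the unconfirmed list, which is why losing the race
 * above can safely be answered with NF_DROP.
 */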

/* Returns true if a connection corresponding to the tuple exists (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct nf_conntrack_tuple_hash *h;

	read_lock_bh(&nf_conntrack_lock);
	h = __nf_conntrack_find(tuple, ignored_conntrack);
	read_unlock_bh(&nf_conntrack_lock);

	return h != NULL;
}

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
{
	return !(test_bit(IPS_ASSURED_BIT,
			  &nf_ct_tuplehash_to_ctrack(i)->status));
}

static int early_drop(struct list_head *chain)
{
	/* Traverse backwards: gives us oldest, which is roughly LRU */
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct = NULL;
	int dropped = 0;

	read_lock_bh(&nf_conntrack_lock);
	h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
	if (h) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		atomic_inc(&ct->ct_general.use);
	}
	read_unlock_bh(&nf_conntrack_lock);

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		NF_CT_STAT_INC(early_drop);
	}
	nf_ct_put(ct);
	return dropped;
}
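
/*
 * early_drop() only evicts entries that were never marked ASSURED by
 * their protocol tracker; if the searched chain contains nothing but
 * assured connections, eviction fails and the caller drops the new
 * packet instead of sacrificing an established connection.
 */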

static inline int helper_cmp(const struct nf_conntrack_helper *i,
			     const struct nf_conntrack_tuple *rtuple)
{
	return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
}

static struct nf_conntrack_helper *
nf_ct_find_helper(const struct nf_conntrack_tuple *tuple)
{
	return LIST_FIND(&helpers, helper_cmp,
			 struct nf_conntrack_helper *,
			 tuple);
}

static struct nf_conn *
__nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     const struct nf_conntrack_l3proto *l3proto)
{
	struct nf_conn *conntrack = NULL;
	u_int32_t features = 0;

	if (!nf_conntrack_hash_rnd_initted) {
		get_random_bytes(&nf_conntrack_hash_rnd, 4);
		nf_conntrack_hash_rnd_initted = 1;
	}

	if (nf_conntrack_max
	    && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
		unsigned int hash = hash_conntrack(orig);
		/* Try dropping from this hash chain. */
		if (!early_drop(&nf_conntrack_hash[hash])) {
			if (net_ratelimit())
				printk(KERN_WARNING
				       "nf_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/* find features needed by this conntrack. */
	features = l3proto->get_features(orig);
	read_lock_bh(&nf_conntrack_lock);
	if (nf_ct_find_helper(repl) != NULL)
		features |= NF_CT_F_HELP;
	read_unlock_bh(&nf_conntrack_lock);

	DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);

	read_lock_bh(&nf_ct_cache_lock);

	if (!nf_ct_cache[features].use) {
		DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
		       features);
		goto out;
	}

	conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
	if (conntrack == NULL) {
		DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
		goto out;
	}

	memset(conntrack, 0, nf_ct_cache[features].size);
	conntrack->features = features;
	if (nf_ct_cache[features].init_conntrack &&
	    nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
		DEBUGP("nf_conntrack_alloc: failed to init\n");
		kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
		conntrack = NULL;
		goto out;
	}

	atomic_set(&conntrack->ct_general.use, 1);
	conntrack->ct_general.destroy = destroy_conntrack;
	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* Don't set timer yet: wait for confirmation */
	init_timer(&conntrack->timeout);
	conntrack->timeout.data = (unsigned long)conntrack;
	conntrack->timeout.function = death_by_timeout;

	atomic_inc(&nf_conntrack_count);
out:
	read_unlock_bh(&nf_ct_cache_lock);
	return conntrack;
}

struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl)
{
	struct nf_conntrack_l3proto *l3proto;

	l3proto = nf_ct_find_l3proto(orig->src.l3num);
	return __nf_conntrack_alloc(orig, repl, l3proto);
}

void nf_conntrack_free(struct nf_conn *conntrack)
{
	u_int32_t features = conntrack->features;
	NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
	DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
	       conntrack);
	kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
	atomic_dec(&nf_conntrack_count);
}

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(const struct nf_conntrack_tuple *tuple,
	       struct nf_conntrack_l3proto *l3proto,
	       struct nf_conntrack_protocol *protocol,
	       struct sk_buff *skb,
	       unsigned int dataoff)
{
	struct nf_conn *conntrack;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_expect *exp;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
		DEBUGP("Can't invert tuple.\n");
		return NULL;
	}

	conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
	if (conntrack == NULL || IS_ERR(conntrack)) {
		DEBUGP("Can't allocate conntrack.\n");
		return (struct nf_conntrack_tuple_hash *)conntrack;
	}

	if (!protocol->new(conntrack, skb, dataoff)) {
		nf_conntrack_free(conntrack);
		DEBUGP("init conntrack: can't track with proto module\n");
		return NULL;
	}

	write_lock_bh(&nf_conntrack_lock);
	exp = find_expectation(tuple);

	if (exp) {
		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
		       conntrack, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
		conntrack->master = exp->master;
#ifdef CONFIG_NF_CONNTRACK_MARK
		conntrack->mark = exp->master->mark;
#endif
		nf_conntrack_get(&conntrack->master->ct_general);
		NF_CT_STAT_INC(expect_new);
	} else {
		conntrack->helper = nf_ct_find_helper(&repl_tuple);

		NF_CT_STAT_INC(new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);

	write_unlock_bh(&nf_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(conntrack, exp);
		nf_conntrack_expect_put(exp);
	}

	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int16_t l3num,
		  u_int8_t protonum,
		  struct nf_conntrack_l3proto *l3proto,
		  struct nf_conntrack_protocol *proto,
		  int *set_reply,
		  enum ip_conntrack_info *ctinfo)
{
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
			     dataoff, l3num, protonum, &tuple, l3proto,
			     proto)) {
		DEBUGP("resolve_normal_ct: Can't get tuple\n");
		return NULL;
	}

	/* look for tuple match */
	h = nf_conntrack_find_get(&tuple, NULL);
	if (!h) {
		h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Set the reply bit if this packet is OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}

unsigned int
nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_protocol *proto;
	unsigned int dataoff;
	u_int8_t protonum;
	int set_reply = 0;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if ((*pskb)->nfct) {
		NF_CT_STAT_INC(ignore);
		return NF_ACCEPT;
	}

	l3proto = nf_ct_find_l3proto((u_int16_t)pf);
	if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
		DEBUGP("not prepared to track yet or error occurred\n");
		return -ret;
	}

	proto = nf_ct_find_proto((u_int16_t)pf, protonum);

	/* It may be a special packet, error, unclean...
	 * the inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (proto->error != NULL &&
	    (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
		NF_CT_STAT_INC(error);
		NF_CT_STAT_INC(invalid);
		return -ret;
	}

	ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
			       &set_reply, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC(invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC(drop);
		return NF_DROP;
	}

	NF_CT_ASSERT((*pskb)->nfct);

	ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
	if (ret < 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		DEBUGP("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put((*pskb)->nfct);
		(*pskb)->nfct = NULL;
		NF_CT_STAT_INC(invalid);
		return -ret;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_STATUS, *pskb);

	return ret;
}

int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			 const struct nf_conntrack_tuple *orig)
{
	return nf_ct_invert_tuple(inverse, orig,
				  nf_ct_find_l3proto(orig->src.l3num),
				  nf_ct_find_proto(orig->src.l3num,
						   orig->dst.protonum));
}

/* Would two expected things clash? */
static inline int expect_clash(const struct nf_conntrack_expect *a,
			       const struct nf_conntrack_expect *b)
{
	/* Part covered by intersection of masks must be unequal,
	   otherwise they clash */
	struct nf_conntrack_tuple intersect_mask;
	int count;

	intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
	intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
	intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
	intersect_mask.dst.protonum = a->mask.dst.protonum
				      & b->mask.dst.protonum;

	for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
		intersect_mask.src.u3.all[count] =
			a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
	}

	for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
		intersect_mask.dst.u3.all[count] =
			a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
	}

	return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}
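
/*
 * Two expectations clash when their tuples agree on every field that
 * both masks cover.  For example, if both masks cover the destination
 * port, differing ports are enough to keep them apart; a field
 * wildcarded by either mask cannot distinguish them and is ignored.
 */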

static inline int expect_matches(const struct nf_conntrack_expect *a,
				 const struct nf_conntrack_expect *b)
{
	return a->master == b->master
	       && nf_ct_tuple_equal(&a->tuple, &b->tuple)
	       && nf_ct_tuple_equal(&a->mask, &b->mask);
}

/* Generally a bad idea to call this: could have matched already. */
void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
{
	struct nf_conntrack_expect *i;

	write_lock_bh(&nf_conntrack_lock);
	/* choose the oldest expectation to evict */
	list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
			nf_ct_unlink_expect(i);
			write_unlock_bh(&nf_conntrack_lock);
			nf_conntrack_expect_put(i);
			return;
		}
	}
	write_unlock_bh(&nf_conntrack_lock);
}

/* We don't increase the master conntrack refcount for non-fulfilled
 * conntracks.  During the conntrack destruction, the expectations are
 * always killed before the conntrack itself */
struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
{
	struct nf_conntrack_expect *new;

	new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
	if (!new) {
		DEBUGP("expect_related: OOM allocating expect\n");
		return NULL;
	}
	new->master = me;
	atomic_set(&new->use, 1);
	return new;
}

void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
{
	if (atomic_dec_and_test(&exp->use))
		kmem_cache_free(nf_conntrack_expect_cachep, exp);
}

static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
{
	atomic_inc(&exp->use);
	exp->master->expecting++;
	list_add(&exp->list, &nf_conntrack_expect_list);

	init_timer(&exp->timeout);
	exp->timeout.data = (unsigned long)exp;
	exp->timeout.function = expectation_timed_out;
	exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
	add_timer(&exp->timeout);

	atomic_inc(&exp->use);
	NF_CT_STAT_INC(expect_create);
}
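
/*
 * Two references are taken above: the first atomic_inc() is held by
 * the global expectation list, the second by the pending timer.
 * nf_ct_unlink_expect() drops the list reference; whoever stopped or
 * fired the timer owns, and eventually puts, the remaining one.
 */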

/* Race with expectations being used means we could have none to find; OK. */
static void evict_oldest_expect(struct nf_conn *master)
{
	struct nf_conntrack_expect *i;

	list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
		if (i->master == master) {
			if (del_timer(&i->timeout)) {
				nf_ct_unlink_expect(i);
				nf_conntrack_expect_put(i);
			}
			break;
		}
	}
}

static inline int refresh_timer(struct nf_conntrack_expect *i)
{
	if (!del_timer(&i->timeout))
		return 0;

	i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
	add_timer(&i->timeout);
	return 1;
}

int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
{
	struct nf_conntrack_expect *i;
	int ret;

	DEBUGP("nf_conntrack_expect_related %p\n", expect);
	DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
	DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);

	write_lock_bh(&nf_conntrack_lock);
	list_for_each_entry(i, &nf_conntrack_expect_list, list) {
		if (expect_matches(i, expect)) {
			/* Refresh timer: if it's dying, ignore.. */
			if (refresh_timer(i)) {
				ret = 0;
				goto out;
			}
		} else if (expect_clash(i, expect)) {
			ret = -EBUSY;
			goto out;
		}
	}
	/* Will we be over the limit? */
	if (expect->master->helper->max_expected &&
	    expect->master->expecting >= expect->master->helper->max_expected)
		evict_oldest_expect(expect->master);

	nf_conntrack_expect_insert(expect);
	nf_conntrack_expect_event(IPEXP_NEW, expect);
	ret = 0;
out:
	write_unlock_bh(&nf_conntrack_lock);
	return ret;
}

/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __nf_conntrack_confirm */
void nf_conntrack_alter_reply(struct nf_conn *conntrack,
			      const struct nf_conntrack_tuple *newreply)
{
	write_lock_bh(&nf_conntrack_lock);
	/* Should be unconfirmed, so not in hash table yet */
	NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));

	DEBUGP("Altering reply tuple of %p to ", conntrack);
	NF_CT_DUMP_TUPLE(newreply);

	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (!conntrack->master && conntrack->expecting == 0)
		conntrack->helper = nf_ct_find_helper(newreply);
	write_unlock_bh(&nf_conntrack_lock);
}

int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
{
	int ret;
	BUG_ON(me->timeout == 0);

	ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
					  sizeof(struct nf_conn)
					  + sizeof(union nf_conntrack_help)
					  + __alignof__(union nf_conntrack_help),
					  init_conntrack_for_helper);
	if (ret < 0) {
		printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
		return ret;
	}
	write_lock_bh(&nf_conntrack_lock);
	list_prepend(&helpers, me);
	write_unlock_bh(&nf_conntrack_lock);

	return 0;
}

static inline int unhelp(struct nf_conntrack_tuple_hash *i,
			 const struct nf_conntrack_helper *me)
{
	if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
		nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
		nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
	}
	return 0;
}

void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
{
	unsigned int i;
	struct nf_conntrack_expect *exp, *tmp;

	/* Need write lock here, to delete helper. */
	write_lock_bh(&nf_conntrack_lock);
	LIST_DELETE(&helpers, me);

	/* Get rid of expectations */
	list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
		if (exp->master->helper == me && del_timer(&exp->timeout)) {
			nf_ct_unlink_expect(exp);
			nf_conntrack_expect_put(exp);
		}
	}

	/* Get rid of expecteds, set helpers to NULL. */
	LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
	for (i = 0; i < nf_conntrack_htable_size; i++)
		LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
			    struct nf_conntrack_tuple_hash *, me);
	write_unlock_bh(&nf_conntrack_lock);

	/* Someone could still be looking at the helper in a bh. */
	synchronize_net();
}

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	int event = 0;

	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
	NF_CT_ASSERT(skb);

	write_lock_bh(&nf_conntrack_lock);

	/* If not in hash table, timer will not be active yet */
	if (!nf_ct_is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
		event = IPCT_REFRESH;
	} else {
		/* Need del_timer for race avoidance (may already be dying). */
		if (del_timer(&ct->timeout)) {
			ct->timeout.expires = jiffies + extra_jiffies;
			add_timer(&ct->timeout);
			event = IPCT_REFRESH;
		}
	}

#ifdef CONFIG_NF_CT_ACCT
	if (do_acct) {
		ct->counters[CTINFO2DIR(ctinfo)].packets++;
		ct->counters[CTINFO2DIR(ctinfo)].bytes +=
			skb->len - (unsigned int)(skb->nh.raw - skb->data);
		if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
		    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
			event |= IPCT_COUNTER_FILLING;
	}
#endif

	write_unlock_bh(&nf_conntrack_lock);

	/* must be unlocked when calling event cache */
	if (event)
		nf_conntrack_event_cache(event, skb);
}

/* Used by ipt_REJECT and ip6t_REJECT. */
void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

static inline int
do_iter(const struct nf_conntrack_tuple_hash *i,
	int (*iter)(struct nf_conn *i, void *data),
	void *data)
{
	return iter(nf_ct_tuplehash_to_ctrack(i), data);
}

/* Bring out ya dead! */
static struct nf_conntrack_tuple_hash *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		void *data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h = NULL;

	write_lock_bh(&nf_conntrack_lock);
	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
				struct nf_conntrack_tuple_hash *, iter, data);
		if (h)
			break;
	}
	if (!h)
		h = LIST_FIND_W(&unconfirmed, do_iter,
				struct nf_conntrack_tuple_hash *, iter, data);
	if (h)
		atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
	write_unlock_bh(&nf_conntrack_lock);

	return h;
}

void
nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct nf_conntrack_tuple_hash *h;
	unsigned int bucket = 0;

	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		nf_ct_put(ct);
	}
}

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
{
	if (vmalloced)
		vfree(hash);
	else
		free_pages((unsigned long)hash,
			   get_order(sizeof(struct list_head) * size));
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(void)
{
	int i;

	/* This makes sure all current packets have passed through
	   the netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	nf_ct_event_cache_flush();
 i_see_dead_people:
	nf_ct_iterate_cleanup(kill_all, NULL);
	if (atomic_read(&nf_conntrack_count) != 0) {
		schedule();
		goto i_see_dead_people;
	}

	for (i = 0; i < NF_CT_F_NUM; i++) {
		if (nf_ct_cache[i].use == 0)
			continue;

		NF_CT_ASSERT(nf_ct_cache[i].use == 1);
		nf_ct_cache[i].use = 1;
		nf_conntrack_unregister_cache(i);
	}
	kmem_cache_destroy(nf_conntrack_expect_cachep);
	free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
			    nf_conntrack_htable_size);

	/* free l3proto protocol tables */
	for (i = 0; i < PF_MAX; i++)
		if (nf_ct_protos[i]) {
			kfree(nf_ct_protos[i]);
			nf_ct_protos[i] = NULL;
		}
}

static struct list_head *alloc_hashtable(int size, int *vmalloced)
{
	struct list_head *hash;
	unsigned int i;

	*vmalloced = 0;
	hash = (void*)__get_free_pages(GFP_KERNEL,
				       get_order(sizeof(struct list_head)
						 * size));
	if (!hash) {
		*vmalloced = 1;
		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
		hash = vmalloc(sizeof(struct list_head) * size);
	}

	if (hash)
		for (i = 0; i < size; i++)
			INIT_LIST_HEAD(&hash[i]);

	return hash;
}
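
/*
 * The table is requested as physically contiguous pages first; only
 * if that high-order allocation fails does the code fall back to
 * vmalloc(), which builds the table from scattered pages at the cost
 * of going through an extra page-table mapping.
 */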

int set_hashsize(const char *val, struct kernel_param *kp)
{
	int i, bucket, hashsize, vmalloced;
	int old_vmalloced, old_size;
	int rnd;
	struct list_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_htable_size)
		return param_set_uint(val, kp);

	hashsize = simple_strtol(val, NULL, 0);
	if (!hashsize)
		return -EINVAL;

	hash = alloc_hashtable(hashsize, &vmalloced);
	if (!hash)
		return -ENOMEM;

	/* We have to rehash for the new table anyway, so we also can
	 * use a new random seed */
	get_random_bytes(&rnd, 4);

	write_lock_bh(&nf_conntrack_lock);
	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!list_empty(&nf_conntrack_hash[i])) {
			h = list_entry(nf_conntrack_hash[i].next,
				       struct nf_conntrack_tuple_hash, list);
			list_del(&h->list);
			bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
			list_add_tail(&h->list, &hash[bucket]);
		}
	}
	old_size = nf_conntrack_htable_size;
	old_vmalloced = nf_conntrack_vmalloc;
	old_hash = nf_conntrack_hash;

	nf_conntrack_htable_size = hashsize;
	nf_conntrack_vmalloc = vmalloced;
	nf_conntrack_hash = hash;
	nf_conntrack_hash_rnd = rnd;
	write_unlock_bh(&nf_conntrack_lock);

	free_conntrack_hash(old_hash, old_vmalloced, old_size);
	return 0;
}

module_param_call(hashsize, set_hashsize, param_get_uint,
		  &nf_conntrack_htable_size, 0600);
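
/*
 * With mode 0600 the parameter is root-writable at runtime, so the
 * hash table can be resized on a live system, conventionally through
 * the sysfs path the module loader creates for a module named
 * nf_conntrack:
 *
 *	echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * set_hashsize() then rehashes every entry into the new table under
 * the write lock, using a fresh random seed.
 */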

int __init nf_conntrack_init(void)
{
	unsigned int i;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
	if (!nf_conntrack_htable_size) {
		nf_conntrack_htable_size
			= (((num_physpages << PAGE_SHIFT) / 16384)
			   / sizeof(struct list_head));
		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 8192;
		if (nf_conntrack_htable_size < 16)
			nf_conntrack_htable_size = 16;
	}
	nf_conntrack_max = 8 * nf_conntrack_htable_size;
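
	/*
	 * Worked example of the sizing above: a 32MB i386 machine has
	 * 32MB / 16384 = 2048 bytes of table budget, i.e. 256 buckets
	 * at 8 bytes per struct list_head, and nf_conntrack_max then
	 * defaults to 8 * 256 = 2048 tracked connections.
	 */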

	printk("nf_conntrack version %s (%u buckets, %d max)\n",
	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
	       nf_conntrack_max);

	nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
					    &nf_conntrack_vmalloc);
	if (!nf_conntrack_hash) {
		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
		goto err_out;
	}

	ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
					  sizeof(struct nf_conn), NULL);
	if (ret < 0) {
		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
		goto err_free_hash;
	}

	nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
					sizeof(struct nf_conntrack_expect),
					0, 0, NULL, NULL);
	if (!nf_conntrack_expect_cachep) {
		printk(KERN_ERR "Unable to create nf_expect slab cache\n");
		goto err_free_conntrack_slab;
	}

	/* Don't NEED lock here, but good form anyway. */
	write_lock_bh(&nf_conntrack_lock);
	for (i = 0; i < PF_MAX; i++)
		nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
	write_unlock_bh(&nf_conntrack_lock);

	/* Set up fake conntrack:
	    - to never be deleted, not in any hashes */
	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
	/*  - and make it look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);

	return ret;

err_free_conntrack_slab:
	nf_conntrack_unregister_cache(NF_CT_F_BASIC);
err_free_hash:
	free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
			    nf_conntrack_htable_size);
err_out:
	return -ENOMEM;
}