/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
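
/*
 * Illustrative sketch (not part of the original file): how this table is
 * consulted.  The legacy TOS bits sit in bits 1..4 of the TOS byte, so a
 * caller masks with IPTOS_TOS() and shifts one bit down before indexing;
 * the in-tree helper rt_tos2priority() in <net/route.h> does exactly this.
 * The example_ name below is hypothetical.
 */
#if 0	/* example only */
static inline char example_tos2prio(u8 tos)
{
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}
#endif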


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 *
 * (A minimal reader-side sketch follows below.)
 */

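/*
 * Illustrative sketch (not part of the original file): the reader side of
 * the scheme above, reduced to its skeleton.  A reader walks a bucket
 * under rcu_read_lock_bh() and takes a reference with an atomic increment
 * (dst_hold()); only writers take the per-bucket spinlock before
 * unlinking.  The example_ name is hypothetical.
 */
#if 0	/* example only */
static struct rtable *example_reader(unsigned hash, struct flowi *key)
{
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (compare_keys(&rth->fl, key)) {
			dst_hold(&rth->u.dst);	/* atomic refcount bump */
			break;
		}
	}
	rcu_read_unlock_bh();
	return rth;	/* reference held, safe outside the RCU section */
}
#endif
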
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.  (On lockdep we have a quite big spinlock_t, so keep the
 * size down there; a striping sketch follows below.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
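
/*
 * Illustrative sketch (not part of the original file): rt_hash_lock_addr()
 * folds the bucket slot into the lock table, so slots that differ by a
 * multiple of RT_HASH_LOCK_SZ share one striped lock.  The example_ name
 * is hypothetical.
 */
#if 0	/* example only */
static void example_lock_striping(void)
{
	BUG_ON(rt_hash_lock_addr(7) != rt_hash_lock_addr(7 + RT_HASH_LOCK_SZ));
}
#endif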

static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;
static unsigned int		rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
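
/*
 * Illustrative sketch (not part of the original file): a typical call site.
 * The interface index is shifted left by 5 before being XORed into the
 * source address, so it perturbs bits above the lowest ones.  The example_
 * name is hypothetical.
 */
#if 0	/* example only */
static unsigned example_hash_for_device(__be32 daddr, __be32 saddr,
					struct net_device *dev)
{
	return rt_hash(daddr, saddr, dev->ifindex);
}
#endif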

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = rcu_dereference(seq->private);

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	struct seq_file *seq;
	int rc = -ENOMEM;
	struct rt_cache_iter_state *s;

	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		goto out;
	rc = seq_open(file, &rt_cache_seq_ops);
	if (rc)
		goto out_kfree;
	seq          = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
	);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
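
/*
 * Illustrative sketch (not part of the original file): how the score is
 * used.  rt_intern_hash() below tracks the minimum-score entry in a chain
 * and evicts it once the chain grows past ip_rt_gc_elasticity, so an
 * entry with bit 31 set always outlives one scored only by its
 * inverted-age counter in bits 29..0.  The example_ name is hypothetical.
 */
#if 0	/* example only */
static void example_score_order(struct rtable *valuable, struct rtable *stale)
{
	/* Assumes rt_valuable(valuable) && !rt_valuable(stale). */
	BUG_ON(rt_score(valuable) < rt_score(stale));
}
#endif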

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (*rthp == NULL)
			continue;
		spin_lock(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock(rt_hash_lock_addr(i));

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline has not been reached, prolong the timer to
		   "delay", otherwise fire it at the deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an equilibrium
   point, where the number of entries aged out stays approximately equal
   to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We adjust it dynamically, so that when networking is idle, "expire"
   stays large enough to keep plenty of warm entries, and when load
   increases it shrinks to limit the cache size (a sketch of this
   feedback follows below).
 */

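/*
 * Illustrative sketch (not part of the original file): the feedback that
 * rt_garbage_collect() applies to "expire".  Every pass that misses its
 * goal halves the expiration strength; reaching the goal lets it creep
 * back up by ip_rt_gc_min_interval, capped at ip_rt_gc_timeout.  The
 * example_ name is hypothetical.
 */
#if 0	/* example only */
static void example_expire_feedback(void)
{
	unsigned long expire = RT_GC_TIMEOUT;	/* starts at 300*HZ */

	expire >>= 1;				/* goal missed: 150*HZ */
	expire >>= 1;				/* missed again: 75*HZ */

	expire += ip_rt_gc_min_interval;	/* goal met: +HZ/2 */
	if (expire > ip_rt_gc_timeout)		/* never above the ceiling */
		expire = ip_rt_gc_timeout;
}
#endif
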
static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable *rth, **rthp;
	unsigned long now;
	struct rtable *cand, **candp;
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (compare_keys(&rth->fl, &rt->fl)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind the route to a neighbour via ARP only if it is an
	   output route or on the unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.dst.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.dst.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp = &rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 *
 * (An illustration of the backoff schedule follows below.)
 */

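/*
 * Illustrative sketch (not part of the original file): the backoff
 * schedule that the load-limit test below produces.  Redirect number k
 * (0-based rate_tokens) is not sent until ip_rt_redirect_load << k
 * jiffies after the previous one, so with the defaults the gap doubles
 * each time until ip_rt_redirect_number is reached and the host is
 * written off.  The example_ name is hypothetical.
 */
#if 0	/* example only */
static unsigned long example_redirect_gap(const struct rtable *rt)
{
	/* Minimum jiffies between the previous redirect and the next. */
	return ip_rt_redirect_load << rt->u.dst.rate_tokens;
}
#endif
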
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * and set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
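
/*
 * Illustrative sketch (not part of the original file): plateau lookup in
 * action.  An implausible old MTU snaps down to the next lower plateau,
 * and anything at or below 128 falls through to the protocol minimum of
 * 68.  The example_ name is hypothetical.
 */
#if 0	/* example only */
static void example_guess_mtu(void)
{
	BUG_ON(guess_mtu(1500) != 1492);	/* next plateau below 1500 */
	BUG_ON(guess_mtu(600) != 576);		/* next plateau below 600 */
	BUG_ON(guess_mtu(100) != 68);		/* floor */
}
#endif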

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != &loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_flags	= RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex);
	return rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result *res,
				  struct in_device *in_dev,
				  __be32 daddr, __be32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719
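/*
 * Wrapper around __mkroute_input(): picks a nexthop when multipath
 * routing is configured, then inserts the new entry into the cache
 * hash keyed on (daddr, saddr, iif).
 */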
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result *res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif);
	return rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
}

/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 */

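/*
 * Slow path of input routing: screen out martian source/destination
 * addresses, look the destination up in the FIB, and then hand off to
 * the broadcast, local-delivery or forwarding leg below.  The
 * resulting entry is inserted into the route cache before returning.
 */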
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned hash;
	__be32 spec_dst;
	int err = -EINVAL;
	int free_res = 0;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */
	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif);
	err = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC 1812).
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

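/*
 * Fast path of input routing: a route cache entry matches only if the
 * full key (daddr, saddr, iif, oif == 0, mark, tos) is identical, so a
 * hit can be used without consulting the FIB.  Multicast destinations
 * are screened here before taking the slow path.
 */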
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
		    rth->fl.mark == skb->mark &&
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry *)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic was moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-(  As a result, a host on a multicast
	 * network acquires a lot of useless route cache entries, a sort of
	 * SDR messages from all over the world.  Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), this does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a route
	 * cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

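/*
 * Build the routing cache entry for locally generated (output) traffic:
 * classify the destination as broadcast/multicast/unicast, allocate the
 * struct rtable, and wire up ip_output() (or ip_mc_output() for
 * multicast/broadcast) as its output handler.
 */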
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result *res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get a working reference to the inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the
	 * routing cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway	= fl->fl4_dst;
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output = ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release the working reference to the inet device */
	in_dev_put(in_dev);

	return err;
}

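/*
 * Wrapper around __mkroute_output(): on success, insert the new entry
 * into the cache hash keyed on the original flow (daddr, saddr, oif).
 */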
static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result *res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}

/*
 * Major route resolver routine.
 */

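/*
 * Resolution proceeds in stages: validate any requested source address,
 * honour an explicit output interface, fall back to loopback when no
 * destination is given, consult the FIB, and finally select a source
 * address and device for the chosen route before building the cache
 * entry at make_route.
 */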
static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = loopback_dev.ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* This is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(saddr) can return the wrong iface, if saddr
		 *    is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the saddr
		 *    of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* Special hack: the user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (look,
			 * the routing cache cannot know that ttl is zero, so
			 * the packet will not leave this host and the route
			 * is valid).  Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}

	if (oldflp->oif) {
		dev_out = dev_get_by_index(&init_net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = loopback_dev.ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong. Assume
			 * that the destination is on-link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, the routing
			 * tables are looked up with only one purpose:
			 * to catch whether the destination is gatewayed,
			 * rather than direct. Moreover, if MSG_DONTROUTE
			 * is set, we send the packet, ignoring both the
			 * routing tables and ifaddr state. --ANK
			 *
			 * We could make this work even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = &loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}

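/*
 * Output fast path: scan the cache bucket under rcu_read_lock_bh() for
 * an exact flow match (tos is compared only on the IPTOS_RT_MASK bits
 * and the RTO_ONLINK flag), falling back to ip_route_output_slow() on
 * a miss.
 */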
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

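/*
 * Blackhole dst_ops: used below when __xfrm_lookup() returns -EREMOTE
 * (typically while IPsec SA resolution is still pending).  Such a
 * route accepts the packet and silently frees it instead of
 * transmitting it.
 */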
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		= AF_INET,
	.protocol	= __constant_htons(ETH_P_IP),
	.destroy	= ipv4_dst_destroy,
	.check		= ipv4_dst_check,
	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
	.entry_size	= sizeof(struct rtable),
};

static int ipv4_blackhole_output(struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}

static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = ipv4_blackhole_output;
		new->output = ipv4_blackhole_output;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}

int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp, sk);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

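/*
 * A minimal caller sketch (error handling elided); the names used here
 * are illustrative, not taken from this file:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = some_daddr } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&rt, &fl) == 0) {
 *		... transmit via rt->u.dst ...
 *		ip_rt_put(rt);
 *	}
 */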
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(rp, flp, NULL, 0);
}

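/*
 * Serialize one routing cache entry into an RTM_NEWROUTE netlink
 * message, including metrics, cache info and (for input routes) the
 * incoming interface.
 */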
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

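/*
 * RTM_GETROUTE handler: builds a dummy skb, resolves the requested
 * flow through either the input path (when RTA_IIF is given) or the
 * output path, and unicasts the rt_fill_info() result back to the
 * requester.
 */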
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(&init_net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = (struct rtable *)skb->dst;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(&rt, &fl);
	}

	if (err)
		goto errout_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

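/*
 * Dump the whole route cache to netlink.  cb->args[0]/args[1] record
 * the current hash bucket and chain index so an interrupted dump can
 * resume where it left off.
 */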
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h)
			continue;
		if (h > s_h)
			s_idx = 0;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

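/*
 * Writing a delay to /proc/sys/net/ipv4/route/flush triggers a route
 * cache flush; the file is write-only (mode 0200), so reads fail with
 * -EINVAL.
 */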
static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
				     struct file *filp, void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
					      int __user *name,
					      int nlen,
					      void __user *oldval,
					      size_t __user *oldlenp,
					      void __user *newval,
					      size_t newlen)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}

ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy first cpu. */
		*start = buffer;
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time */
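		/*
		 * Note: the memcpy above already copied CPU 0's counters,
		 * and for_each_possible_cpu() iterates from CPU 0 as well,
		 * so CPU 0 appears to be accumulated twice here.
		 */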
		for_each_possible_cpu(i) {
			unsigned int j;

			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

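/* "rhash_entries=N" on the kernel command line overrides the size of
 * the route cache hash table allocated in ip_rt_init() below.
 */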
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

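/*
 * One-time initialization: allocate the route cache hash table and the
 * dst slab, seed the hash, arm the periodic/secret/flush timers, and
 * register the /proc and rtnetlink interfaces.
 */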
int __init ip_rt_init(void)
{
	int rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	{
	int order;
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
	}
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_periodic_timer);
	rt_periodic_timer.function = rt_check_expire;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers started at system startup tend
	 * to synchronize. Perturb them a bit.
	 */
	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
					ip_rt_gc_interval;
	add_timer(&rt_periodic_timer);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
					ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	{
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
					     init_net.proc_net_stat))) {
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);