/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static void rt_check_expire(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.entry_size =		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
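
/*
 * Illustration: the rt_tos2priority() helper (include/net/route.h)
 * indexes this table with (tos & IPTOS_TOS_MASK) >> 1, so e.g.
 * IPTOS_LOWDELAY (0x10) maps to slot 8, TC_PRIO_INTERACTIVE, while TOS
 * values with the low 0x02 bit set land on the adjacent ECN_OR_COST()
 * entry.
 */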

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
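
/*
 * Illustration (mirrors rt_del() and the lookup loops below): a reader
 * walks a chain under rcu_read_lock_bh() and pins an entry with
 * dst_hold(); a writer takes rt_hash_lock_addr(hash), unlinks the entry
 * from the chain and passes it to rt_free(), which defers the actual
 * free through call_rcu_bh() until all current readers have left their
 * read-side critical sections.
 */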

struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif
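
/*
 * Note: rt_hash_lock_addr() stripes the locks, so with RT_HASH_LOCK_SZ
 * of 256 the buckets n, n + 256, n + 512, ... all share
 * rt_hash_locks[n & 255].  This bounds the lock table to a fixed size
 * while still spreading writer contention across many locks.
 */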

static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static unsigned int		rt_hash_log;
static unsigned int		rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		(__force u32)(__be32)(saddr) ^ ((idx) << 5))
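
/*
 * Example: an input-route lookup for a packet arriving on ifindex 2
 * computes rt_hash(daddr, saddr, 2), i.e. jhash_2words(daddr,
 * saddr ^ (2 << 5), rt_hash_rnd) masked down to the table size.  Mixing
 * in rt_hash_rnd (regenerated on every cache flush, see rt_run_flush())
 * keeps chain placement unpredictable to remote senders.
 */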

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = rcu_dereference(seq->private);

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
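
/*
 * Worked example: an unreferenced output-route entry (fl.iif == 0) last
 * used 100 jiffies ago and carrying no special flags scores
 * (~100 & ~(3<<30)) | (1<<30): bit 30 set, bit 31 clear.  A redirected
 * entry of the same age additionally gets bit 31 via rt_valuable(), so
 * rt_intern_hash() below treats it as the least attractive eviction
 * candidate when trimming an overlong chain.
 */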

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
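
/*
 * compare_keys() ORs together the XORs of all key fields, so the result
 * is zero exactly when every field matches -- one branch for the whole
 * key.  Note the 16-bit load starting at tos: in this flowi layout it
 * also covers the byte adjacent to tos, comparing both at once.
 */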

static void rt_check_expire(struct work_struct *work)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
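
/*
 * Note on the scan rate above: goal is
 * (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets per run,
 * capped at the table size.  With the defaults (interval 60*HZ, timeout
 * 300*HZ) each run covers a fifth of the table, so every chain is
 * revisited roughly once per ip_rt_gc_timeout.
 */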

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline has not been reached, prolong the timer
		   to "delay", otherwise fire it at the deadline.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want an algorithm which keeps the routing cache at some
   equilibrium point, where the number of aged-off entries stays
   approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that while networking is idle
   expire stays large enough to keep plenty of warm entries, and when
   load increases it shrinks to limit the cache size.
 */
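
/*
 * Concretely: "expire" starts at RT_GC_TIMEOUT and is halved each time a
 * pass below misses its goal, making eviction progressively more
 * aggressive under pressure; the work_done path adds
 * ip_rt_gc_min_interval back (capped at ip_rt_gc_timeout), relaxing the
 * strength again once the cache drops below gc_thresh.
 */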

static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		   We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (compare_keys(&rth->fl, &rt->fl)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind the route to arp only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no way to
 * guarantee that the chosen ID is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
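
/*
 * Usage sketch (callers reach this through the ip_select_ident() helpers
 * in <net/ip.h>): when an inet_peer is attached, the IP ID comes from
 * the peer's per-destination counter via inet_getid(), so IDs increase
 * monotonically per destination; only the fallback path above risks
 * reusing an ID early.
 */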

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.dst.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.dst.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ip_rt_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
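
/*
 * Worked example with the defaults above, assuming HZ = 1000:
 * ip_rt_redirect_load is HZ/50 = 20 jiffies, so redirect n (0-based
 * rate_tokens) is sent only after 20 << n jiffies of silence -- 20 ms,
 * 40 ms, ..., about 5.1 s before the ninth and last.  Once rate_tokens
 * reaches ip_rt_redirect_number we go quiet until ip_rt_redirect_silence
 * ((HZ/50) << 10, about 20 s) elapses without triggering packets.
 */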

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Just set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
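
/*
 * Example: a "fragmentation needed" ICMP that carries no next-hop MTU
 * for a 1500-byte packet leads to guess_mtu(1500), which walks the
 * RFC 1191 plateau table and returns 1492, the largest plateau strictly
 * below the old size; an old MTU at or below 128 falls through to the
 * 68-byte IPv4 minimum.
 */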

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
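
/*
 * Note on the defaults above: ADVMSS falls back to dev->mtu - 40, the
 * link MTU minus 20 bytes of IPv4 header and 20 bytes of TCP header,
 * and the 65535 - 40 clamp keeps the advertised MSS plus those headers
 * within the 16-bit IP total-length limit.
 */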
1495
Al Viro9e12bb22006-09-26 21:25:20 -07001496static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497 u8 tos, struct net_device *dev, int our)
1498{
1499 unsigned hash;
1500 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001501 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001502 struct in_device *in_dev = in_dev_get(dev);
1503 u32 itag = 0;
1504
1505 /* Primary sanity checks. */
1506
1507 if (in_dev == NULL)
1508 return -EINVAL;
1509
1510 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1511 skb->protocol != htons(ETH_P_IP))
1512 goto e_inval;
1513
1514 if (ZERONET(saddr)) {
1515 if (!LOCAL_MCAST(daddr))
1516 goto e_inval;
1517 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1518 } else if (fib_validate_source(saddr, 0, tos, 0,
1519 dev, &spec_dst, &itag) < 0)
1520 goto e_inval;
1521
1522 rth = dst_alloc(&ipv4_dst_ops);
1523 if (!rth)
1524 goto e_nobufs;
1525
1526 rth->u.dst.output= ip_rt_bug;
1527
1528 atomic_set(&rth->u.dst.__refcnt, 1);
1529 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001530 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001531 rth->u.dst.flags |= DST_NOPOLICY;
1532 rth->fl.fl4_dst = daddr;
1533 rth->rt_dst = daddr;
1534 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001535 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001536 rth->fl.fl4_src = saddr;
1537 rth->rt_src = saddr;
1538#ifdef CONFIG_NET_CLS_ROUTE
1539 rth->u.dst.tclassid = itag;
1540#endif
1541 rth->rt_iif =
1542 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001543 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001544 dev_hold(rth->u.dst.dev);
1545 rth->idev = in_dev_get(rth->u.dst.dev);
1546 rth->fl.oif = 0;
1547 rth->rt_gateway = daddr;
1548 rth->rt_spec_dst= spec_dst;
1549 rth->rt_type = RTN_MULTICAST;
1550 rth->rt_flags = RTCF_MULTICAST;
1551 if (our) {
1552 rth->u.dst.input= ip_local_deliver;
1553 rth->rt_flags |= RTCF_LOCAL;
1554 }
1555
1556#ifdef CONFIG_IP_MROUTE
1557 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1558 rth->u.dst.input = ip_mr_input;
1559#endif
1560 RT_CACHE_STAT_INC(in_slow_mc);
1561
1562 in_dev_put(in_dev);
Al Viro8c7bc842006-09-26 21:26:19 -07001563 hash = rt_hash(daddr, saddr, dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001564 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1565
1566e_nobufs:
1567 in_dev_put(in_dev);
1568 return -ENOBUFS;
1569
1570e_inval:
1571 in_dev_put(in_dev);
1572 return -EINVAL;
1573}
1574
1575
1576static void ip_handle_martian_source(struct net_device *dev,
1577 struct in_device *in_dev,
1578 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001579 __be32 daddr,
1580 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001581{
1582 RT_CACHE_STAT_INC(in_martian_src);
1583#ifdef CONFIG_IP_ROUTE_VERBOSE
1584 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1585 /*
1586 * RFC1812 recommendation, if source is martian,
1587 * the only hint is MAC header.
1588 */
1589 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1590 "%u.%u.%u.%u, on dev %s\n",
1591 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001592 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001593 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001594 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001595 printk(KERN_WARNING "ll header: ");
1596 for (i = 0; i < dev->hard_header_len; i++, p++) {
1597 printk("%02x", *p);
1598 if (i < (dev->hard_header_len - 1))
1599 printk(":");
1600 }
1601 printk("\n");
1602 }
1603 }
1604#endif
1605}

static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  __be32 daddr, __be32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);
		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP).  Do not create a route if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif);
	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
}

/*
 * NOTE. We drop all the packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the
 * output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for the limited broadcast;
	 * it is not clear whether this should be fixed or not.
	 * Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 * Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     init_net.loopback_dev->ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 * Do not cache martian addresses: they should be logged (RFC 1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable	*rth;
	unsigned	hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
		    rth->fl.mark == skb->mark &&
		    rth->fl.fl4_tos == tos) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry*)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic was moved from the route cache to
	   here.  The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-(  As a result, a
	   host on a multicast network acquires a lot of useless route
	   cache entries for, e.g., SDR messages from all over the world.
	   Now we try to get rid of them.  Really, provided the software
	   IP multicast filter is organized reasonably (at least, hashed),
	   it does not result in a slowdown compared with route cache
	   reject entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
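
/*
 * Illustrative caller sketch (an assumption about typical usage; it
 * mirrors what ip_rcv_finish() does and is not part of this file).
 * On success the input routine chosen above is reachable through
 * skb->dst, so delivery is just dst_input().
 */
#if 0
static int example_route_incoming(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

	if (skb->dst == NULL) {
		err = ip_route_input(skb, iph->daddr, iph->saddr,
				     iph->tos, skb->dev);
		if (err)
			return err;	/* caller drops the packet */
	}
	return dst_input(skb);		/* invokes rth->u.dst.input() */
}
#endif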

static inline int __mkroute_output(struct rtable **result,
				   struct fib_result* res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get a working reference to the inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the default
		   one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the
	   routing cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release the working reference to the inet device */
	in_dev_put(in_dev);

	return err;
}

static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}

/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = init_net.loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* The check for oif == dev_out->oif was removed here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the
		      saddr of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun; it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (look: the routing cache cannot know that ttl is
			   zero, so the packet will not leave this host and
			   the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(&init_net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = init_net.loopback_dev;
		dev_hold(dev_out);
		fl.oif = init_net.loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send the packet, ignoring both the
			   routing tables and the ifaddr state. --ANK


			   We could make this work even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = init_net.loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}

int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK))) {
			rth->u.dst.lastuse = jiffies;
			dst_hold(&rth->u.dst);
			rth->u.dst.__use++;
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		= AF_INET,
	.protocol	= __constant_htons(ETH_P_IP),
	.destroy	= ipv4_dst_destroy,
	.check		= ipv4_dst_check,
	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
	.entry_size	= sizeof(struct rtable),
};


static int ipv4_blackhole_output(struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}

/*
 * Build a copy of an existing route that silently discards every
 * packet sent through it; used by ip_route_output_flow() below when
 * __xfrm_lookup() returns -EREMOTE.
 */
static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = ipv4_blackhole_output;
		new->output = ipv4_blackhole_output;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}

int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp, sk);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(rp, flp, NULL, 0);
}
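
/*
 * Illustrative sketch (not part of the original file): how a caller
 * typically resolves an output route through the API above.  The
 * function name and address are hypothetical.
 */
#if 0
static int example_output_route(void)
{
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = htonl(0x0a000001), /* 10.0.0.1 */
					.tos = RT_TOS(0) } } };
	struct rtable *rt;
	int err;

	err = ip_route_output_key(&rt, &fl);
	if (err)
		return err;
	/* transmit via rt->u.dst, then drop the reference */
	ip_rt_put(rt);
	return 0;
}
#endif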

static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(&init_net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = (struct rtable*) skb->dst;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(&rt, &fl);
	}

	if (err)
		goto errout_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	s_idx = idx = cb->args[1];
	for (h = 0; h <= rt_hash_mask; h++) {
		if (h < s_h)
			continue;
		if (h > s_h)
			s_idx = 0;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
				     struct file *filp, void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
					      int __user *name,
					      int nlen,
					      void __user *oldval,
					      size_t __user *oldlenp,
					      void __user *newval,
					      size_t newlen)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}

ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
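
/*
 * These knobs appear under /proc/sys/net/ipv4/route/.  For example,
 * the write-only "flush" file takes an integer delay that is handed
 * to rt_cache_flush(), so the routing cache can be flushed at once
 * with:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */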
#endif

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/* IP route accounting ptr for this logical cpu number. */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)

#ifdef CONFIG_PROC_FS
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
		u32 *dst = (u32 *) buffer;

		/* Copy first cpu. */
		*start = buffer;
		memcpy(dst, src, length);

		/* Add the other cpus in, one int at a time */
		for_each_possible_cpu(i) {
			unsigned int j;

			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;

			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif /* CONFIG_PROC_FS */
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
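
/*
 * Example: booting with "rhash_entries=262144" on the kernel command
 * line (the value is illustrative) pre-sizes the route cache hash
 * table handed to alloc_large_system_hash() in ip_rt_init() below.
 */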

int __init ip_rt_init(void)
{
	int rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	{
	int order;
	for (order = 0;
	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
		/* NOTHING */;
	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
	memset(ip_rt_acct, 0, PAGE_SIZE << order);
	}
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	init_timer(&rt_flush_timer);
	rt_flush_timer.function = rt_run_flush;
	init_timer(&rt_secret_timer);
	rt_secret_timer.function = rt_secret_rebuild;

	/* All the timers started at system startup tend
	   to synchronize.  Perturb them a bit.
	 */
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	{
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
					     init_net.proc_net_stat))) {
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);