blob: baee304a3cb76127affa65fca0a4898c7028de1f [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070057 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080058 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070060 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070075#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070076#include <linux/string.h>
77#include <linux/socket.h>
78#include <linux/sockios.h>
79#include <linux/errno.h>
80#include <linux/in.h>
81#include <linux/inet.h>
82#include <linux/netdevice.h>
83#include <linux/proc_fs.h>
84#include <linux/init.h>
85#include <linux/skbuff.h>
86#include <linux/rtnetlink.h>
87#include <linux/inetdevice.h>
88#include <linux/igmp.h>
89#include <linux/pkt_sched.h>
90#include <linux/mroute.h>
91#include <linux/netfilter_ipv4.h>
92#include <linux/random.h>
93#include <linux/jhash.h>
94#include <linux/rcupdate.h>
95#include <linux/times.h>
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
106#include <net/ip_mp_alg.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700107#include <net/netevent.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
111
112#define RT_FL_TOS(oldflp) \
113 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
119static int ip_rt_min_delay = 2 * HZ;
120static int ip_rt_max_delay = 10 * HZ;
121static int ip_rt_max_size;
122static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
123static int ip_rt_gc_interval = 60 * HZ;
124static int ip_rt_gc_min_interval = HZ / 2;
125static int ip_rt_redirect_number = 9;
126static int ip_rt_redirect_load = HZ / 50;
127static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost = HZ;
129static int ip_rt_error_burst = 5 * HZ;
130static int ip_rt_gc_elasticity = 8;
131static int ip_rt_mtu_expires = 10 * 60 * HZ;
132static int ip_rt_min_pmtu = 512 + 20 + 20;
133static int ip_rt_min_advmss = 256;
134static int ip_rt_secret_interval = 10 * 60 * HZ;
135static unsigned long rt_deadline;
136
137#define RTprint(a...) printk(KERN_DEBUG a)
138
139static struct timer_list rt_flush_timer;
140static struct timer_list rt_periodic_timer;
141static struct timer_list rt_secret_timer;
142
143/*
144 * Interface to generic destination cache.
145 */
146
147static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148static void ipv4_dst_destroy(struct dst_entry *dst);
149static void ipv4_dst_ifdown(struct dst_entry *dst,
150 struct net_device *dev, int how);
151static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152static void ipv4_link_failure(struct sk_buff *skb);
153static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154static int rt_garbage_collect(void);
155
156
157static struct dst_ops ipv4_dst_ops = {
158 .family = AF_INET,
159 .protocol = __constant_htons(ETH_P_IP),
160 .gc = rt_garbage_collect,
161 .check = ipv4_dst_check,
162 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice,
165 .link_failure = ipv4_link_failure,
166 .update_pmtu = ip_rt_update_pmtu,
167 .entry_size = sizeof(struct rtable),
168};
169
170#define ECN_OR_COST(class) TC_PRIO_##class
171
172__u8 ip_tos2prio[16] = {
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(FILLER),
175 TC_PRIO_BESTEFFORT,
176 ECN_OR_COST(BESTEFFORT),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_BULK,
180 ECN_OR_COST(BULK),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE,
184 ECN_OR_COST(INTERACTIVE),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK),
187 TC_PRIO_INTERACTIVE_BULK,
188 ECN_OR_COST(INTERACTIVE_BULK)
189};
190
191
192/*
193 * Route cache.
194 */
195
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    while they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
205
/* One bucket of the route cache hash: the head of a singly linked chain. */
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (with lockdep spinlock_t is quite big, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Lock covering bucket 'slot'; several buckets share one lock. */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])
/*
 * Allocate and initialise the lock table; panics if the allocation fails.
 * Wrapped in do { } while (0) so it behaves as a single statement and
 * must be invoked as one, i.e. "rt_hash_lock_init();".
 */
# define rt_hash_lock_init()	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table in this configuration: the lock address is simply NULL. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()	do { } while (0)
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700245
246static struct rt_hash_bucket *rt_hash_table;
247static unsigned rt_hash_mask;
248static int rt_hash_log;
249static unsigned int rt_hash_rnd;
250
Eric Dumazet2f970d82006-01-17 02:54:36 -0800251static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800252#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700253 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700254
255static int rt_intern_hash(unsigned hash, struct rtable *rth,
256 struct rtable **res);
257
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800258static unsigned int rt_hash_code(u32 daddr, u32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259{
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800260 return (jhash_2words(daddr, saddr, rt_hash_rnd)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261 & rt_hash_mask);
262}
263
Al Viro8c7bc842006-09-26 21:26:19 -0700264#define rt_hash(daddr, saddr, idx) \
265 rt_hash_code((__force u32)(__be32)(daddr),\
266 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
267
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268#ifdef CONFIG_PROC_FS
/* Per-open iterator state of the route cache seq_file. */
struct rt_cache_iter_state {
	int	bucket;		/* hash bucket currently being walked */
};
272
273static struct rtable *rt_cache_get_first(struct seq_file *seq)
274{
275 struct rtable *r = NULL;
276 struct rt_cache_iter_state *st = seq->private;
277
278 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279 rcu_read_lock_bh();
280 r = rt_hash_table[st->bucket].chain;
281 if (r)
282 break;
283 rcu_read_unlock_bh();
284 }
285 return r;
286}
287
288static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289{
290 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291
292 r = r->u.rt_next;
293 while (!r) {
294 rcu_read_unlock_bh();
295 if (--st->bucket < 0)
296 break;
297 rcu_read_lock_bh();
298 r = rt_hash_table[st->bucket].chain;
299 }
300 return r;
301}
302
303static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304{
305 struct rtable *r = rt_cache_get_first(seq);
306
307 if (r)
308 while (pos && (r = rt_cache_get_next(seq, r)))
309 --pos;
310 return pos ? NULL : r;
311}
312
313static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314{
315 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
316}
317
318static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319{
320 struct rtable *r = NULL;
321
322 if (v == SEQ_START_TOKEN)
323 r = rt_cache_get_first(seq);
324 else
325 r = rt_cache_get_next(seq, v);
326 ++*pos;
327 return r;
328}
329
330static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331{
332 if (v && v != SEQ_START_TOKEN)
333 rcu_read_unlock_bh();
334}
335
336static int rt_cache_seq_show(struct seq_file *seq, void *v)
337{
338 if (v == SEQ_START_TOKEN)
339 seq_printf(seq, "%-127s\n",
340 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
342 "HHUptod\tSpecDst");
343 else {
344 struct rtable *r = v;
345 char temp[256];
346
347 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349 r->u.dst.dev ? r->u.dst.dev->name : "*",
350 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352 r->u.dst.__use, 0, (unsigned long)r->rt_src,
353 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355 dst_metric(&r->u.dst, RTAX_WINDOW),
356 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357 dst_metric(&r->u.dst, RTAX_RTTVAR)),
358 r->fl.fl4_tos,
359 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
361 dev_queue_xmit) : 0,
362 r->rt_spec_dst);
363 seq_printf(seq, "%-127s\n", temp);
364 }
365 return 0;
366}
367
368static struct seq_operations rt_cache_seq_ops = {
369 .start = rt_cache_seq_start,
370 .next = rt_cache_seq_next,
371 .stop = rt_cache_seq_stop,
372 .show = rt_cache_seq_show,
373};
374
375static int rt_cache_seq_open(struct inode *inode, struct file *file)
376{
377 struct seq_file *seq;
378 int rc = -ENOMEM;
379 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
380
381 if (!s)
382 goto out;
383 rc = seq_open(file, &rt_cache_seq_ops);
384 if (rc)
385 goto out_kfree;
386 seq = file->private_data;
387 seq->private = s;
388 memset(s, 0, sizeof(*s));
389out:
390 return rc;
391out_kfree:
392 kfree(s);
393 goto out;
394}
395
396static struct file_operations rt_cache_seq_fops = {
397 .owner = THIS_MODULE,
398 .open = rt_cache_seq_open,
399 .read = seq_read,
400 .llseek = seq_lseek,
401 .release = seq_release_private,
402};
403
404
405static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
406{
407 int cpu;
408
409 if (*pos == 0)
410 return SEQ_START_TOKEN;
411
412 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413 if (!cpu_possible(cpu))
414 continue;
415 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800416 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700417 }
418 return NULL;
419}
420
421static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
422{
423 int cpu;
424
425 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426 if (!cpu_possible(cpu))
427 continue;
428 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800429 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700430 }
431 return NULL;
432
433}
434
/* seq_file ->stop for the per-CPU statistics: nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
439
440static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441{
442 struct rt_cache_stat *st = v;
443
444 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700445 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700446 return 0;
447 }
448
449 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
450 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451 atomic_read(&ipv4_dst_ops.entries),
452 st->in_hit,
453 st->in_slow_tot,
454 st->in_slow_mc,
455 st->in_no_route,
456 st->in_brd,
457 st->in_martian_dst,
458 st->in_martian_src,
459
460 st->out_hit,
461 st->out_slow_tot,
462 st->out_slow_mc,
463
464 st->gc_total,
465 st->gc_ignored,
466 st->gc_goal_miss,
467 st->gc_dst_overflow,
468 st->in_hlist_search,
469 st->out_hlist_search
470 );
471 return 0;
472}
473
474static struct seq_operations rt_cpu_seq_ops = {
475 .start = rt_cpu_seq_start,
476 .next = rt_cpu_seq_next,
477 .stop = rt_cpu_seq_stop,
478 .show = rt_cpu_seq_show,
479};
480
481
482static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483{
484 return seq_open(file, &rt_cpu_seq_ops);
485}
486
487static struct file_operations rt_cpu_seq_fops = {
488 .owner = THIS_MODULE,
489 .open = rt_cpu_seq_open,
490 .read = seq_read,
491 .llseek = seq_lseek,
492 .release = seq_release,
493};
494
495#endif /* CONFIG_PROC_FS */
496
497static __inline__ void rt_free(struct rtable *rt)
498{
499 multipath_remove(rt);
500 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
501}
502
503static __inline__ void rt_drop(struct rtable *rt)
504{
505 multipath_remove(rt);
506 ip_rt_put(rt);
507 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
508}
509
510static __inline__ int rt_fast_clean(struct rtable *rth)
511{
512 /* Kill broadcast/multicast entries very aggresively, if they
513 collide in hash table with more useful entries */
514 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515 rth->fl.iif && rth->u.rt_next;
516}
517
518static __inline__ int rt_valuable(struct rtable *rth)
519{
520 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
521 rth->u.dst.expires;
522}
523
524static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
525{
526 unsigned long age;
527 int ret = 0;
528
529 if (atomic_read(&rth->u.dst.__refcnt))
530 goto out;
531
532 ret = 1;
533 if (rth->u.dst.expires &&
534 time_after_eq(jiffies, rth->u.dst.expires))
535 goto out;
536
537 age = jiffies - rth->u.dst.lastuse;
538 ret = 0;
539 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540 (age <= tmo2 && rt_valuable(rth)))
541 goto out;
542 ret = 1;
543out: return ret;
544}
545
546/* Bits of score are:
547 * 31: very valuable
548 * 30: not quite useless
549 * 29..0: usage counter
550 */
551static inline u32 rt_score(struct rtable *rt)
552{
553 u32 score = jiffies - rt->u.dst.lastuse;
554
555 score = ~score & ~(3<<30);
556
557 if (rt_valuable(rt))
558 score |= (1<<31);
559
560 if (!rt->fl.iif ||
561 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562 score |= (1<<30);
563
564 return score;
565}
566
567static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
568{
Al Viro714e85b2006-11-14 20:51:49 -0800569 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
570 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
Thomas Graf47dcf0c2006-11-09 15:20:38 -0800571 (fl1->mark ^ fl2->mark) |
David S. Miller8238b212006-10-12 00:49:15 -0700572 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
573 *(u16 *)&fl2->nl_u.ip4_u.tos) |
574 (fl1->oif ^ fl2->oif) |
575 (fl1->iif ^ fl2->iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700576}
577
578#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
579static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
580 struct rtable *expentry,
581 int *removed_count)
582{
583 int passedexpired = 0;
584 struct rtable **nextstep = NULL;
585 struct rtable **rthp = chain_head;
586 struct rtable *rth;
587
588 if (removed_count)
589 *removed_count = 0;
590
591 while ((rth = *rthp) != NULL) {
592 if (rth == expentry)
593 passedexpired = 1;
594
595 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
596 compare_keys(&(*rthp)->fl, &expentry->fl)) {
597 if (*rthp == expentry) {
598 *rthp = rth->u.rt_next;
599 continue;
600 } else {
601 *rthp = rth->u.rt_next;
602 rt_free(rth);
603 if (removed_count)
604 ++(*removed_count);
605 }
606 } else {
607 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
608 passedexpired && !nextstep)
609 nextstep = &rth->u.rt_next;
610
611 rthp = &rth->u.rt_next;
612 }
613 }
614
615 rt_free(expentry);
616 if (removed_count)
617 ++(*removed_count);
618
619 return nextstep;
620}
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622
623
624/* This runs via a timer and thus is always in BH context. */
625static void rt_check_expire(unsigned long dummy)
626{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700627 static unsigned int rover;
628 unsigned int i = rover, goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629 struct rtable *rth, **rthp;
630 unsigned long now = jiffies;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700631 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700632
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700633 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
634 if (ip_rt_gc_timeout > 1)
635 do_div(mult, ip_rt_gc_timeout);
636 goal = (unsigned int)mult;
637 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
638 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700639 unsigned long tmo = ip_rt_gc_timeout;
640
641 i = (i + 1) & rt_hash_mask;
642 rthp = &rt_hash_table[i].chain;
643
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700644 if (*rthp == 0)
645 continue;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700646 spin_lock(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700647 while ((rth = *rthp) != NULL) {
648 if (rth->u.dst.expires) {
649 /* Entry is expired even if it is in use */
650 if (time_before_eq(now, rth->u.dst.expires)) {
651 tmo >>= 1;
652 rthp = &rth->u.rt_next;
653 continue;
654 }
655 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
656 tmo >>= 1;
657 rthp = &rth->u.rt_next;
658 continue;
659 }
660
661 /* Cleanup aged off entries. */
662#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663 /* remove all related balanced entries if necessary */
664 if (rth->u.dst.flags & DST_BALANCED) {
665 rthp = rt_remove_balanced_route(
666 &rt_hash_table[i].chain,
667 rth, NULL);
668 if (!rthp)
669 break;
670 } else {
671 *rthp = rth->u.rt_next;
672 rt_free(rth);
673 }
674#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
675 *rthp = rth->u.rt_next;
676 rt_free(rth);
677#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
678 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700679 spin_unlock(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680
681 /* Fallback loop breaker. */
682 if (time_after(jiffies, now))
683 break;
684 }
685 rover = i;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700686 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700687}
688
689/* This can run from both BH and non-BH contexts, the latter
690 * in the case of a forced flush event.
691 */
692static void rt_run_flush(unsigned long dummy)
693{
694 int i;
695 struct rtable *rth, *next;
696
697 rt_deadline = 0;
698
699 get_random_bytes(&rt_hash_rnd, 4);
700
701 for (i = rt_hash_mask; i >= 0; i--) {
Eric Dumazet22c047c2005-07-05 14:55:24 -0700702 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703 rth = rt_hash_table[i].chain;
704 if (rth)
705 rt_hash_table[i].chain = NULL;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700706 spin_unlock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700707
708 for (; rth; rth = next) {
709 next = rth->u.rt_next;
710 rt_free(rth);
711 }
712 }
713}
714
715static DEFINE_SPINLOCK(rt_flush_lock);
716
717void rt_cache_flush(int delay)
718{
719 unsigned long now = jiffies;
720 int user_mode = !in_softirq();
721
722 if (delay < 0)
723 delay = ip_rt_min_delay;
724
725 /* flush existing multipath state*/
726 multipath_flush();
727
728 spin_lock_bh(&rt_flush_lock);
729
730 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
731 long tmo = (long)(rt_deadline - now);
732
733 /* If flush timer is already running
734 and flush request is not immediate (delay > 0):
735
736 if deadline is not achieved, prolongate timer to "delay",
737 otherwise fire it at deadline time.
738 */
739
740 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
741 tmo = 0;
742
743 if (delay > tmo)
744 delay = tmo;
745 }
746
747 if (delay <= 0) {
748 spin_unlock_bh(&rt_flush_lock);
749 rt_run_flush(0);
750 return;
751 }
752
753 if (rt_deadline == 0)
754 rt_deadline = now + ip_rt_max_delay;
755
756 mod_timer(&rt_flush_timer, now+delay);
757 spin_unlock_bh(&rt_flush_lock);
758}
759
760static void rt_secret_rebuild(unsigned long dummy)
761{
762 unsigned long now = jiffies;
763
764 rt_cache_flush(0);
765 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
766}
767
/*
   Short description of GC goals.

   We want an algorithm that keeps the routing cache at an
   equilibrium point, where the number of aged-off entries
   is approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
780
781static int rt_garbage_collect(void)
782{
783 static unsigned long expire = RT_GC_TIMEOUT;
784 static unsigned long last_gc;
785 static int rover;
786 static int equilibrium;
787 struct rtable *rth, **rthp;
788 unsigned long now = jiffies;
789 int goal;
790
791 /*
792 * Garbage collection is pretty expensive,
793 * do not make it too frequently.
794 */
795
796 RT_CACHE_STAT_INC(gc_total);
797
798 if (now - last_gc < ip_rt_gc_min_interval &&
799 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
800 RT_CACHE_STAT_INC(gc_ignored);
801 goto out;
802 }
803
804 /* Calculate number of entries, which we want to expire now. */
805 goal = atomic_read(&ipv4_dst_ops.entries) -
806 (ip_rt_gc_elasticity << rt_hash_log);
807 if (goal <= 0) {
808 if (equilibrium < ipv4_dst_ops.gc_thresh)
809 equilibrium = ipv4_dst_ops.gc_thresh;
810 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
811 if (goal > 0) {
812 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
813 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
814 }
815 } else {
816 /* We are in dangerous area. Try to reduce cache really
817 * aggressively.
818 */
819 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
820 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
821 }
822
823 if (now - last_gc >= ip_rt_gc_min_interval)
824 last_gc = now;
825
826 if (goal <= 0) {
827 equilibrium += goal;
828 goto work_done;
829 }
830
831 do {
832 int i, k;
833
834 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
835 unsigned long tmo = expire;
836
837 k = (k + 1) & rt_hash_mask;
838 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700839 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700840 while ((rth = *rthp) != NULL) {
841 if (!rt_may_expire(rth, tmo, expire)) {
842 tmo >>= 1;
843 rthp = &rth->u.rt_next;
844 continue;
845 }
846#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
847 /* remove all related balanced entries
848 * if necessary
849 */
850 if (rth->u.dst.flags & DST_BALANCED) {
851 int r;
852
853 rthp = rt_remove_balanced_route(
Suresh Bhogavilli85259872006-02-21 13:42:22 -0800854 &rt_hash_table[k].chain,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700855 rth,
856 &r);
857 goal -= r;
858 if (!rthp)
859 break;
860 } else {
861 *rthp = rth->u.rt_next;
862 rt_free(rth);
863 goal--;
864 }
865#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866 *rthp = rth->u.rt_next;
867 rt_free(rth);
868 goal--;
869#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
870 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700871 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700872 if (goal <= 0)
873 break;
874 }
875 rover = k;
876
877 if (goal <= 0)
878 goto work_done;
879
880 /* Goal is not achieved. We stop process if:
881
882 - if expire reduced to zero. Otherwise, expire is halfed.
883 - if table is not full.
884 - if we are called from interrupt.
885 - jiffies check is just fallback/debug loop breaker.
886 We will not spin here for long time in any case.
887 */
888
889 RT_CACHE_STAT_INC(gc_goal_miss);
890
891 if (expire == 0)
892 break;
893
894 expire >>= 1;
895#if RT_CACHE_DEBUG >= 2
896 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
897 atomic_read(&ipv4_dst_ops.entries), goal, i);
898#endif
899
900 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901 goto out;
902 } while (!in_softirq() && time_before_eq(jiffies, now));
903
904 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
905 goto out;
906 if (net_ratelimit())
907 printk(KERN_WARNING "dst cache overflow\n");
908 RT_CACHE_STAT_INC(gc_dst_overflow);
909 return 1;
910
911work_done:
912 expire += ip_rt_gc_min_interval;
913 if (expire > ip_rt_gc_timeout ||
914 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
915 expire = ip_rt_gc_timeout;
916#if RT_CACHE_DEBUG >= 2
917 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
918 atomic_read(&ipv4_dst_ops.entries), goal, rover);
919#endif
920out: return 0;
921}
922
923static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
924{
925 struct rtable *rth, **rthp;
926 unsigned long now;
927 struct rtable *cand, **candp;
928 u32 min_score;
929 int chain_length;
930 int attempts = !in_softirq();
931
932restart:
933 chain_length = 0;
934 min_score = ~(u32)0;
935 cand = NULL;
936 candp = NULL;
937 now = jiffies;
938
939 rthp = &rt_hash_table[hash].chain;
940
Eric Dumazet22c047c2005-07-05 14:55:24 -0700941 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700942 while ((rth = *rthp) != NULL) {
943#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944 if (!(rth->u.dst.flags & DST_BALANCED) &&
945 compare_keys(&rth->fl, &rt->fl)) {
946#else
947 if (compare_keys(&rth->fl, &rt->fl)) {
948#endif
949 /* Put it first */
950 *rthp = rth->u.rt_next;
951 /*
952 * Since lookup is lockfree, the deletion
953 * must be visible to another weakly ordered CPU before
954 * the insertion at the start of the hash chain.
955 */
956 rcu_assign_pointer(rth->u.rt_next,
957 rt_hash_table[hash].chain);
958 /*
959 * Since lookup is lockfree, the update writes
960 * must be ordered for consistency on SMP.
961 */
962 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
963
964 rth->u.dst.__use++;
965 dst_hold(&rth->u.dst);
966 rth->u.dst.lastuse = now;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700967 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700968
969 rt_drop(rt);
970 *rp = rth;
971 return 0;
972 }
973
974 if (!atomic_read(&rth->u.dst.__refcnt)) {
975 u32 score = rt_score(rth);
976
977 if (score <= min_score) {
978 cand = rth;
979 candp = rthp;
980 min_score = score;
981 }
982 }
983
984 chain_length++;
985
986 rthp = &rth->u.rt_next;
987 }
988
989 if (cand) {
990 /* ip_rt_gc_elasticity used to be average length of chain
991 * length, when exceeded gc becomes really aggressive.
992 *
993 * The second limit is less certain. At the moment it allows
994 * only 2 entries per bucket. We will see.
995 */
996 if (chain_length > ip_rt_gc_elasticity) {
997 *candp = cand->u.rt_next;
998 rt_free(cand);
999 }
1000 }
1001
1002 /* Try to bind route to arp only if it is output
1003 route or unicast forwarding path.
1004 */
1005 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006 int err = arp_bind_neighbour(&rt->u.dst);
1007 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001008 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001009
1010 if (err != -ENOBUFS) {
1011 rt_drop(rt);
1012 return err;
1013 }
1014
1015 /* Neighbour tables are full and nothing
1016 can be released. Try to shrink route cache,
1017 it is most likely it holds some neighbour records.
1018 */
1019 if (attempts-- > 0) {
1020 int saved_elasticity = ip_rt_gc_elasticity;
1021 int saved_int = ip_rt_gc_min_interval;
1022 ip_rt_gc_elasticity = 1;
1023 ip_rt_gc_min_interval = 0;
1024 rt_garbage_collect();
1025 ip_rt_gc_min_interval = saved_int;
1026 ip_rt_gc_elasticity = saved_elasticity;
1027 goto restart;
1028 }
1029
1030 if (net_ratelimit())
1031 printk(KERN_WARNING "Neighbour table overflow.\n");
1032 rt_drop(rt);
1033 return -ENOBUFS;
1034 }
1035 }
1036
1037 rt->u.rt_next = rt_hash_table[hash].chain;
1038#if RT_CACHE_DEBUG >= 2
1039 if (rt->u.rt_next) {
1040 struct rtable *trt;
1041 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042 NIPQUAD(rt->rt_dst));
1043 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1044 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1045 printk("\n");
1046 }
1047#endif
1048 rt_hash_table[hash].chain = rt;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001049 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001050 *rp = rt;
1051 return 0;
1052}
1053
1054void rt_bind_peer(struct rtable *rt, int create)
1055{
1056 static DEFINE_SPINLOCK(rt_peer_lock);
1057 struct inet_peer *peer;
1058
1059 peer = inet_getpeer(rt->rt_dst, create);
1060
1061 spin_lock_bh(&rt_peer_lock);
1062 if (rt->peer == NULL) {
1063 rt->peer = peer;
1064 peer = NULL;
1065 }
1066 spin_unlock_bh(&rt_peer_lock);
1067 if (peer)
1068 inet_putpeer(peer);
1069}
1070
1071/*
1072 * Peer allocation may fail only in serious out-of-memory conditions. However
1073 * we still can generate some output.
1074 * Random ID selection looks a bit dangerous because we have no chances to
1075 * select ID being unique in a reasonable period of time.
1076 * But broken packet identifier may be better than no packet at all.
1077 */
1078static void ip_select_fb_ident(struct iphdr *iph)
1079{
1080 static DEFINE_SPINLOCK(ip_fb_id_lock);
1081 static u32 ip_fallback_id;
1082 u32 salt;
1083
1084 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001085 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001086 iph->id = htons(salt & 0xFFFF);
1087 ip_fallback_id = salt;
1088 spin_unlock_bh(&ip_fb_id_lock);
1089}
1090
1091void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1092{
1093 struct rtable *rt = (struct rtable *) dst;
1094
1095 if (rt) {
1096 if (rt->peer == NULL)
1097 rt_bind_peer(rt, 1);
1098
1099 /* If peer is attached to destination, it is never detached,
1100 so that we need not to grab a lock to dereference it.
1101 */
1102 if (rt->peer) {
1103 iph->id = htons(inet_getid(rt->peer, more));
1104 return;
1105 }
1106 } else
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001107 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1108 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001109
1110 ip_select_fb_ident(iph);
1111}
1112
1113static void rt_del(unsigned hash, struct rtable *rt)
1114{
1115 struct rtable **rthp;
1116
Eric Dumazet22c047c2005-07-05 14:55:24 -07001117 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001118 ip_rt_put(rt);
1119 for (rthp = &rt_hash_table[hash].chain; *rthp;
1120 rthp = &(*rthp)->u.rt_next)
1121 if (*rthp == rt) {
1122 *rthp = rt->u.rt_next;
1123 rt_free(rt);
1124 break;
1125 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001126 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001127}
1128
/*
 * Process a redirect that came from old_gw on dev and advises new_gw as
 * the gateway for daddr (saddr being the source address of the affected
 * flow).
 *
 * After validating new_gw, every combination of source key {saddr, 0} and
 * interface key {dev->ifindex, 0} is hashed and the matching chain is
 * searched for an output route (fl.iif == 0) that currently goes through
 * old_gw on dev.  Such a route is cloned, the clone gets RTCF_REDIRECTED
 * and rt_gateway = new_gw, and it replaces the old entry in the cache.
 *
 * The in_dev reference taken at the top is released on every exit path.
 */
Al Virof7655222006-09-26 21:25:43 -07001129void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1130 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001131{
1132 int i, k;
1133 struct in_device *in_dev = in_dev_get(dev);
1134 struct rtable *rth, **rthp;
Al Virof7655222006-09-26 21:25:43 -07001135 __be32 skeys[2] = { saddr, 0 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001136 int ikeys[2] = { dev->ifindex, 0 };
Tom Tucker8d717402006-07-30 20:43:36 -07001137 struct netevent_redirect netevent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001138
Linus Torvalds1da177e2005-04-16 15:20:36 -07001139 if (!in_dev)
1140 return;
1141
/* Refuse a redirect that changes nothing, that the device is not
 * configured to accept (IN_DEV_RX_REDIRECTS), or whose new gateway
 * fails the MULTICAST/BADCLASS/ZERONET address checks.
 */
1142 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1143 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1144 goto reject_redirect;
1145
/* Further gateway validation; which test applies depends on
 * IN_DEV_SHARED_MEDIA(in_dev).
 */
1146 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1147 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1148 goto reject_redirect;
1149 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1150 goto reject_redirect;
1151 } else {
1152 if (inet_addr_type(new_gw) != RTN_UNICAST)
1153 goto reject_redirect;
1154 }
1155
/* i selects the source key, k the interface key. */
1156 for (i = 0; i < 2; i++) {
1157 for (k = 0; k < 2; k++) {
Al Viro8c7bc842006-09-26 21:26:19 -07001158 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001159
1160 rthp=&rt_hash_table[hash].chain;
1161
1162 rcu_read_lock();
1163 while ((rth = rcu_dereference(*rthp)) != NULL) {
1164 struct rtable *rt;
1165
/* Flow key mismatch: advance to the next entry in the chain. */
1166 if (rth->fl.fl4_dst != daddr ||
1167 rth->fl.fl4_src != skeys[i] ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001168 rth->fl.oif != ikeys[k] ||
1169 rth->fl.iif != 0) {
1170 rthp = &rth->u.rt_next;
1171 continue;
1172 }
1173
/* Key matched, but the entry is not an error-free route via
 * old_gw on dev: stop scanning this chain.
 */
1174 if (rth->rt_dst != daddr ||
1175 rth->rt_src != saddr ||
1176 rth->u.dst.error ||
1177 rth->rt_gateway != old_gw ||
1178 rth->u.dst.dev != dev)
1179 break;
1180
/* Take a reference on rth before leaving the RCU read section. */
1181 dst_hold(&rth->u.dst);
1182 rcu_read_unlock();
1183
1184 rt = dst_alloc(&ipv4_dst_ops);
1185 if (rt == NULL) {
1186 ip_rt_put(rth);
1187 in_dev_put(in_dev);
1188 return;
1189 }
1190
1191 /* Copy all the information. */
1192 *rt = *rth;
/* Reset the per-entry state that must not be shared with rth and
 * take separate references for the pointers the copy duplicated.
 */
1193 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1194 rt->u.dst.__use = 1;
1195 atomic_set(&rt->u.dst.__refcnt, 1);
1196 rt->u.dst.child = NULL;
1197 if (rt->u.dst.dev)
1198 dev_hold(rt->u.dst.dev);
1199 if (rt->idev)
1200 in_dev_hold(rt->idev);
1201 rt->u.dst.obsolete = 0;
1202 rt->u.dst.lastuse = jiffies;
1203 rt->u.dst.path = &rt->u.dst;
1204 rt->u.dst.neighbour = NULL;
1205 rt->u.dst.hh = NULL;
1206 rt->u.dst.xfrm = NULL;
1207
1208 rt->rt_flags |= RTCF_REDIRECTED;
1209
1210 /* Gateway is different ... */
1211 rt->rt_gateway = new_gw;
1212
1213 /* Redirect received -> path was valid */
1214 dst_confirm(&rth->u.dst);
1215
1216 if (rt->peer)
1217 atomic_inc(&rt->peer->refcnt);
1218
/* If no neighbour can be bound, or the bound one is not NUD_VALID,
 * call neigh_event_send() on it (when present), drop both the
 * reference on rth and the clone, and leave rth in the cache.
 */
1219 if (arp_bind_neighbour(&rt->u.dst) ||
1220 !(rt->u.dst.neighbour->nud_state &
1221 NUD_VALID)) {
1222 if (rt->u.dst.neighbour)
1223 neigh_event_send(rt->u.dst.neighbour, NULL);
1224 ip_rt_put(rth);
1225 rt_drop(rt);
1226 goto do_next;
1227 }
Tom Tucker8d717402006-07-30 20:43:36 -07001228
1229 netevent.old = &rth->u.dst;
1230 netevent.new = &rt->u.dst;
1231 call_netevent_notifiers(NETEVENT_REDIRECT,
1232 &netevent);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001233
/* rt_del() drops the reference taken by dst_hold() above and
 * unlinks rth; the clone is then inserted under the same hash.
 */
1234 rt_del(hash, rth);
1235 if (!rt_intern_hash(hash, rt, &rt))
1236 ip_rt_put(rt);
1237 goto do_next;
1238 }
/* Reached only when the chain ended or the scan hit 'break'. */
1239 rcu_read_unlock();
1240 do_next:
1241 ;
1242 }
1243 }
1244 in_dev_put(in_dev);
1245 return;
1246
1247reject_redirect:
1248#ifdef CONFIG_IP_ROUTE_VERBOSE
1249 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1250 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1251 "%u.%u.%u.%u ignored.\n"
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001252 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001253 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001254 NIPQUAD(saddr), NIPQUAD(daddr));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001255#endif
1256 in_dev_put(in_dev);
1257}
1258
1259static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1260{
1261 struct rtable *rt = (struct rtable*)dst;
1262 struct dst_entry *ret = dst;
1263
1264 if (rt) {
1265 if (dst->obsolete) {
1266 ip_rt_put(rt);
1267 ret = NULL;
1268 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269 rt->u.dst.expires) {
Al Viro8c7bc842006-09-26 21:26:19 -07001270 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271 rt->fl.oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001272#if RT_CACHE_DEBUG >= 1
1273 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274 "%u.%u.%u.%u/%02x dropped\n",
1275 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1276#endif
1277 rt_del(hash, rt);
1278 ret = NULL;
1279 }
1280 }
1281 return ret;
1282}
1283
1284/*
1285 * Algorithm:
1286 * 1. The first ip_rt_redirect_number redirects are sent
1287 * with exponential backoff, then we stop sending them at all,
1288 * assuming that the host ignores our redirects.
1289 * 2. If we did not see packets requiring redirects
1290 * during ip_rt_redirect_silence, we assume that the host
1291 * forgot redirected route and start to send redirects again.
1292 *
1293 * This algorithm is much cheaper and more intelligent than dumb load limiting
1294 * in icmp.c.
1295 *
1296 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1298 */
1299
1300void ip_rt_send_redirect(struct sk_buff *skb)
1301{
1302 struct rtable *rt = (struct rtable*)skb->dst;
1303 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1304
1305 if (!in_dev)
1306 return;
1307
1308 if (!IN_DEV_TX_REDIRECTS(in_dev))
1309 goto out;
1310
1311 /* No redirected packets during ip_rt_redirect_silence;
1312 * reset the algorithm.
1313 */
1314 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315 rt->u.dst.rate_tokens = 0;
1316
1317 /* Too many ignored redirects; do not send anything
1318 * set u.dst.rate_last to the last seen redirected packet.
1319 */
1320 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321 rt->u.dst.rate_last = jiffies;
1322 goto out;
1323 }
1324
1325 /* Check for load limit; set rate_last to the latest sent
1326 * redirect.
1327 */
Li Yewang14fb8a72006-12-18 00:26:35 -08001328 if (rt->u.dst.rate_tokens == 0 ||
1329 time_after(jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001330 (rt->u.dst.rate_last +
1331 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1332 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1333 rt->u.dst.rate_last = jiffies;
1334 ++rt->u.dst.rate_tokens;
1335#ifdef CONFIG_IP_ROUTE_VERBOSE
1336 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1337 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1338 net_ratelimit())
1339 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1340 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1341 NIPQUAD(rt->rt_src), rt->rt_iif,
1342 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1343#endif
1344 }
1345out:
1346 in_dev_put(in_dev);
1347}
1348
1349static int ip_error(struct sk_buff *skb)
1350{
1351 struct rtable *rt = (struct rtable*)skb->dst;
1352 unsigned long now;
1353 int code;
1354
1355 switch (rt->u.dst.error) {
1356 case EINVAL:
1357 default:
1358 goto out;
1359 case EHOSTUNREACH:
1360 code = ICMP_HOST_UNREACH;
1361 break;
1362 case ENETUNREACH:
1363 code = ICMP_NET_UNREACH;
1364 break;
1365 case EACCES:
1366 code = ICMP_PKT_FILTERED;
1367 break;
1368 }
1369
1370 now = jiffies;
1371 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1372 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1373 rt->u.dst.rate_tokens = ip_rt_error_burst;
1374 rt->u.dst.rate_last = now;
1375 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1376 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1377 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1378 }
1379
1380out: kfree_skb(skb);
1381 return 0;
1382}
1383
/*
 * MTU plateau table, in descending order.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the first (i.e. largest) plateau that is strictly smaller than
 * old_mtu, or 68 when old_mtu is not above any entry of the table.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau +
		sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (plateau != end) {
		if (old_mtu > *plateau)
			return *plateau;
		plateau++;
	}
	return 68;
}
1401
/*
 * Lower the cached path MTU for the flow described by iph.
 *
 * iph:     IP header whose daddr/saddr select the cached routes and whose
 *          tot_len is taken as the old MTU.
 * new_mtu: MTU value being suggested for the path.
 *
 * Cached output routes (fl.iif == 0) to iph->daddr are searched under the
 * source keys iph->saddr and 0, always with interface key 0.  Entries
 * whose MTU metric is locked are skipped.
 *
 * Returns 0 when ipv4_config.no_pmtu_disc is set; otherwise the MTU
 * estimate from the last matching entry whose current MTU metric was not
 * below it, or new_mtu when there was no such entry.
 */
1402unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1403{
1404 int i;
1405 unsigned short old_mtu = ntohs(iph->tot_len);
1406 struct rtable *rth;
Al Viroe4485152006-09-26 22:15:01 -07001407 __be32 skeys[2] = { iph->saddr, 0, };
1408 __be32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001409 unsigned short est_mtu = 0;
1410
1411 if (ipv4_config.no_pmtu_disc)
1412 return 0;
1413
1414 for (i = 0; i < 2; i++) {
Al Viro8c7bc842006-09-26 21:26:19 -07001415 unsigned hash = rt_hash(daddr, skeys[i], 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001416
1417 rcu_read_lock();
1418 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1419 rth = rcu_dereference(rth->u.rt_next)) {
1420 if (rth->fl.fl4_dst == daddr &&
1421 rth->fl.fl4_src == skeys[i] &&
1422 rth->rt_dst == daddr &&
1423 rth->rt_src == iph->saddr &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424 rth->fl.iif == 0 &&
1425 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1426 unsigned short mtu = new_mtu;
1427
/* The suggested value is unusable (below 68 or not smaller than
 * the old MTU): fall back to the plateau table.
 */
1428 if (new_mtu < 68 || new_mtu >= old_mtu) {
1429
1430 /* BSD 4.2 compatibility hack :-( */
1431 if (mtu == 0 &&
1432 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1433 old_mtu >= 68 + (iph->ihl << 2))
1434 old_mtu -= iph->ihl << 2;
1435
1436 mtu = guess_mtu(old_mtu);
1437 }
/* Only ever shrink the metric.  When the new value is below
 * ip_rt_min_pmtu, clamp to that minimum and lock the MTU metric.
 */
1438 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1439 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1440 dst_confirm(&rth->u.dst);
1441 if (mtu < ip_rt_min_pmtu) {
1442 mtu = ip_rt_min_pmtu;
1443 rth->u.dst.metrics[RTAX_LOCK-1] |=
1444 (1 << RTAX_MTU);
1445 }
1446 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1447 dst_set_expires(&rth->u.dst,
1448 ip_rt_mtu_expires);
1449 }
1450 est_mtu = mtu;
1451 }
1452 }
1453 }
1454 rcu_read_unlock();
1455 }
/* GNU "?:" shorthand: est_mtu if non-zero, else new_mtu. */
1456 return est_mtu ? : new_mtu;
1457}
1458
1459static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1460{
1461 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1462 !(dst_metric_locked(dst, RTAX_MTU))) {
1463 if (mtu < ip_rt_min_pmtu) {
1464 mtu = ip_rt_min_pmtu;
1465 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1466 }
1467 dst->metrics[RTAX_MTU-1] = mtu;
1468 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001469 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470 }
1471}
1472
1473static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1474{
1475 return NULL;
1476}
1477
1478static void ipv4_dst_destroy(struct dst_entry *dst)
1479{
1480 struct rtable *rt = (struct rtable *) dst;
1481 struct inet_peer *peer = rt->peer;
1482 struct in_device *idev = rt->idev;
1483
1484 if (peer) {
1485 rt->peer = NULL;
1486 inet_putpeer(peer);
1487 }
1488
1489 if (idev) {
1490 rt->idev = NULL;
1491 in_dev_put(idev);
1492 }
1493}
1494
1495static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1496 int how)
1497{
1498 struct rtable *rt = (struct rtable *) dst;
1499 struct in_device *idev = rt->idev;
1500 if (dev != &loopback_dev && idev && idev->dev == dev) {
1501 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1502 if (loopback_idev) {
1503 rt->idev = loopback_idev;
1504 in_dev_put(idev);
1505 }
1506 }
1507}
1508
1509static void ipv4_link_failure(struct sk_buff *skb)
1510{
1511 struct rtable *rt;
1512
1513 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1514
1515 rt = (struct rtable *) skb->dst;
1516 if (rt)
1517 dst_set_expires(&rt->u.dst, 0);
1518}
1519
1520static int ip_rt_bug(struct sk_buff *skb)
1521{
1522 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1523 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1524 skb->dev ? skb->dev->name : "?");
1525 kfree_skb(skb);
1526 return 0;
1527}
1528
1529/*
1530 We do not cache source address of outgoing interface,
1531 because it is used only by IP RR, TS and SRR options,
1532 so that it out of fast path.
1533
1534 BTW remember: "addr" is allowed to be not aligned
1535 in IP options!
1536 */
1537
1538void ip_rt_get_source(u8 *addr, struct rtable *rt)
1539{
Al Viroa61ced52006-09-26 21:27:54 -07001540 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 struct fib_result res;
1542
1543 if (rt->fl.iif == 0)
1544 src = rt->rt_src;
1545 else if (fib_lookup(&rt->fl, &res) == 0) {
1546 src = FIB_RES_PREFSRC(res);
1547 fib_res_put(&res);
1548 } else
1549 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1550 RT_SCOPE_UNIVERSE);
1551 memcpy(addr, &src, 4);
1552}
1553
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge tag into the route's tclassid, one 16-bit half at a time: a half
 * of tclassid is filled from tag only if it is currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1563
1564static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1565{
1566 struct fib_info *fi = res->fi;
1567
1568 if (fi) {
1569 if (FIB_RES_GW(*res) &&
1570 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1571 rt->rt_gateway = FIB_RES_GW(*res);
1572 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1573 sizeof(rt->u.dst.metrics));
1574 if (fi->fib_mtu == 0) {
1575 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1576 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1577 rt->rt_gateway != rt->rt_dst &&
1578 rt->u.dst.dev->mtu > 576)
1579 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1580 }
1581#ifdef CONFIG_NET_CLS_ROUTE
1582 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1583#endif
1584 } else
1585 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1586
1587 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1588 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1589 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1590 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1591 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1592 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1593 ip_rt_min_advmss);
1594 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1595 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1596
1597#ifdef CONFIG_NET_CLS_ROUTE
1598#ifdef CONFIG_IP_MULTIPLE_TABLES
1599 set_class_tag(rt, fib_rules_tclass(res));
1600#endif
1601 set_class_tag(rt, itag);
1602#endif
1603 rt->rt_type = res->type;
1604}
1605
Al Viro9e12bb22006-09-26 21:25:20 -07001606static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001607 u8 tos, struct net_device *dev, int our)
1608{
1609 unsigned hash;
1610 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001611 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001612 struct in_device *in_dev = in_dev_get(dev);
1613 u32 itag = 0;
1614
1615 /* Primary sanity checks. */
1616
1617 if (in_dev == NULL)
1618 return -EINVAL;
1619
1620 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1621 skb->protocol != htons(ETH_P_IP))
1622 goto e_inval;
1623
1624 if (ZERONET(saddr)) {
1625 if (!LOCAL_MCAST(daddr))
1626 goto e_inval;
1627 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1628 } else if (fib_validate_source(saddr, 0, tos, 0,
1629 dev, &spec_dst, &itag) < 0)
1630 goto e_inval;
1631
1632 rth = dst_alloc(&ipv4_dst_ops);
1633 if (!rth)
1634 goto e_nobufs;
1635
1636 rth->u.dst.output= ip_rt_bug;
1637
1638 atomic_set(&rth->u.dst.__refcnt, 1);
1639 rth->u.dst.flags= DST_HOST;
1640 if (in_dev->cnf.no_policy)
1641 rth->u.dst.flags |= DST_NOPOLICY;
1642 rth->fl.fl4_dst = daddr;
1643 rth->rt_dst = daddr;
1644 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001645 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001646 rth->fl.fl4_src = saddr;
1647 rth->rt_src = saddr;
1648#ifdef CONFIG_NET_CLS_ROUTE
1649 rth->u.dst.tclassid = itag;
1650#endif
1651 rth->rt_iif =
1652 rth->fl.iif = dev->ifindex;
1653 rth->u.dst.dev = &loopback_dev;
1654 dev_hold(rth->u.dst.dev);
1655 rth->idev = in_dev_get(rth->u.dst.dev);
1656 rth->fl.oif = 0;
1657 rth->rt_gateway = daddr;
1658 rth->rt_spec_dst= spec_dst;
1659 rth->rt_type = RTN_MULTICAST;
1660 rth->rt_flags = RTCF_MULTICAST;
1661 if (our) {
1662 rth->u.dst.input= ip_local_deliver;
1663 rth->rt_flags |= RTCF_LOCAL;
1664 }
1665
1666#ifdef CONFIG_IP_MROUTE
1667 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1668 rth->u.dst.input = ip_mr_input;
1669#endif
1670 RT_CACHE_STAT_INC(in_slow_mc);
1671
1672 in_dev_put(in_dev);
Al Viro8c7bc842006-09-26 21:26:19 -07001673 hash = rt_hash(daddr, saddr, dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1675
1676e_nobufs:
1677 in_dev_put(in_dev);
1678 return -ENOBUFS;
1679
1680e_inval:
1681 in_dev_put(in_dev);
1682 return -EINVAL;
1683}
1684
1685
1686static void ip_handle_martian_source(struct net_device *dev,
1687 struct in_device *in_dev,
1688 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001689 __be32 daddr,
1690 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001691{
1692 RT_CACHE_STAT_INC(in_martian_src);
1693#ifdef CONFIG_IP_ROUTE_VERBOSE
1694 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1695 /*
1696 * RFC1812 recommendation, if source is martian,
1697 * the only hint is MAC header.
1698 */
1699 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1700 "%u.%u.%u.%u, on dev %s\n",
1701 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Olaf Kirch0b7f22a2005-07-11 21:01:42 -07001702 if (dev->hard_header_len && skb->mac.raw) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001703 int i;
1704 unsigned char *p = skb->mac.raw;
1705 printk(KERN_WARNING "ll header: ");
1706 for (i = 0; i < dev->hard_header_len; i++, p++) {
1707 printk("%02x", *p);
1708 if (i < (dev->hard_header_len - 1))
1709 printk(":");
1710 }
1711 printk("\n");
1712 }
1713 }
1714#endif
1715}
1716
/*
 * Allocate and initialise (but do not cache) a forwarding route entry for
 * an input packet, based on FIB result *res.
 *
 * On success returns 0 and stores the new entry, holding one reference,
 * in *result.  Returns -EINVAL when the output device has no in_device,
 * when source validation fails, or when a non-IP packet would be routed
 * back out of the device it arrived on; -ENOBUFS when allocation fails.
 * The working reference on the output in_device is dropped on every path
 * that obtained it.
 */
1717static inline int __mkroute_input(struct sk_buff *skb,
1718 struct fib_result* res,
1719 struct in_device *in_dev,
Al Viro9e12bb22006-09-26 21:25:20 -07001720 __be32 daddr, __be32 saddr, u32 tos,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001721 struct rtable **result)
1722{
1723
1724 struct rtable *rth;
1725 int err;
1726 struct in_device *out_dev;
1727 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001728 __be32 spec_dst;
1729 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730
1731 /* get a working reference to the output device */
1732 out_dev = in_dev_get(FIB_RES_DEV(*res));
1733 if (out_dev == NULL) {
1734 if (net_ratelimit())
1735 printk(KERN_CRIT "Bug in ip_route_input" \
1736 "_slow(). Please, report\n");
1737 return -EINVAL;
1738 }
1739
1740
/* A negative result marks a martian source; a positive one is recorded
 * below as RTCF_DIRECTSRC.
 */
1741 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1742 in_dev->dev, &spec_dst, &itag);
1743 if (err < 0) {
1744 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1745 saddr);
1746
1747 err = -EINVAL;
1748 goto cleanup;
1749 }
1750
1751 if (err)
1752 flags |= RTCF_DIRECTSRC;
1753
/* Packet would leave through the interface it came in on: flag the
 * route with RTCF_DOREDIRECT.
 * NOTE(review): flags can only hold RTCF_DIRECTSRC here, so the
 * RTCF_NAT | RTCF_MASQ test looks always true - confirm.
 */
1754 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1755 (IN_DEV_SHARED_MEDIA(out_dev) ||
1756 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1757 flags |= RTCF_DOREDIRECT;
1758
1759 if (skb->protocol != htons(ETH_P_IP)) {
1760 /* Not IP (i.e. ARP). Do not create route, if it is
1761 * invalid for proxy arp. DNAT routes are always valid.
1762 */
1763 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1764 err = -EINVAL;
1765 goto cleanup;
1766 }
1767 }
1768
1769
1770 rth = dst_alloc(&ipv4_dst_ops);
1771 if (!rth) {
1772 err = -ENOBUFS;
1773 goto cleanup;
1774 }
1775
Julian Anastasovce723d82005-09-08 13:34:47 -07001776 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001777 rth->u.dst.flags= DST_HOST;
1778#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1779 if (res->fi->fib_nhs > 1)
1780 rth->u.dst.flags |= DST_BALANCED;
1781#endif
1782 if (in_dev->cnf.no_policy)
1783 rth->u.dst.flags |= DST_NOPOLICY;
Patrick McHardy1b6651f2006-12-04 19:59:00 -08001784 if (out_dev->cnf.no_xfrm)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001785 rth->u.dst.flags |= DST_NOXFRM;
1786 rth->fl.fl4_dst = daddr;
1787 rth->rt_dst = daddr;
1788 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001789 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001790 rth->fl.fl4_src = saddr;
1791 rth->rt_src = saddr;
1792 rth->rt_gateway = daddr;
1793 rth->rt_iif =
1794 rth->fl.iif = in_dev->dev->ifindex;
1795 rth->u.dst.dev = (out_dev)->dev;
1796 dev_hold(rth->u.dst.dev);
1797 rth->idev = in_dev_get(rth->u.dst.dev);
1798 rth->fl.oif = 0;
1799 rth->rt_spec_dst= spec_dst;
1800
1801 rth->u.dst.input = ip_forward;
1802 rth->u.dst.output = ip_output;
1803
/* Fills gateway, metrics and rt_type from *res. */
1804 rt_set_nexthop(rth, res, itag);
1805
1806 rth->rt_flags = flags;
1807
1808 *result = rth;
1809 err = 0;
1810 cleanup:
1811 /* release the working reference to the output device */
1812 in_dev_put(out_dev);
1813 return err;
1814}
1815
1816static inline int ip_mkroute_input_def(struct sk_buff *skb,
1817 struct fib_result* res,
1818 const struct flowi *fl,
1819 struct in_device *in_dev,
Al Viro9e12bb22006-09-26 21:25:20 -07001820 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821{
Chuck Short7abaa272005-06-22 22:10:23 -07001822 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001823 int err;
1824 unsigned hash;
1825
1826#ifdef CONFIG_IP_ROUTE_MULTIPATH
1827 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1828 fib_select_multipath(fl, res);
1829#endif
1830
1831 /* create a routing cache entry */
1832 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1833 if (err)
1834 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001835
1836 /* put it into the cache */
Al Viro8c7bc842006-09-26 21:26:19 -07001837 hash = rt_hash(daddr, saddr, fl->iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001838 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1839}
1840
/*
 * Create the cache entry (or entries) for a forwarded input packet.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED this simply calls
 * ip_mkroute_input_def().  With it, a FIB result with fewer than two next
 * hops also goes to ip_mkroute_input_def(); otherwise one cache entry is
 * created per next hop, each is reported to multipath_set_nhinfo(), and
 * the entry for the last hop is attached to skb->dst.
 *
 * Returns 0 on success or the first error from __mkroute_input() /
 * rt_intern_hash().
 */
1841static inline int ip_mkroute_input(struct sk_buff *skb,
1842 struct fib_result* res,
1843 const struct flowi *fl,
1844 struct in_device *in_dev,
Al Viro9e12bb22006-09-26 21:25:20 -07001845 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846{
1847#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
Julian Anastasovce723d82005-09-08 13:34:47 -07001848 struct rtable* rth = NULL, *rtres;
/* NOTE(review): hop counters are unsigned char; the type of fib_nhs is
 * not visible here - confirm it cannot exceed 255.
 */
1849 unsigned char hop, hopcount;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001850 int err = -EINVAL;
1851 unsigned int hash;
1852
1853 if (res->fi)
1854 hopcount = res->fi->fib_nhs;
1855 else
1856 hopcount = 1;
1857
Linus Torvalds1da177e2005-04-16 15:20:36 -07001858 /* distinguish between multipath and singlepath */
1859 if (hopcount < 2)
1860 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1861 saddr, tos);
1862
1863 /* add all alternatives to the routing cache */
1864 for (hop = 0; hop < hopcount; hop++) {
1865 res->nh_sel = hop;
1866
/* rtres was set by rt_intern_hash() in the previous iteration; only the
 * last hop's reference is kept, for skb->dst below.
 */
Julian Anastasovce723d82005-09-08 13:34:47 -07001867 /* put reference to previous result */
1868 if (hop)
1869 ip_rt_put(rtres);
1870
Linus Torvalds1da177e2005-04-16 15:20:36 -07001871 /* create a routing cache entry */
1872 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1873 &rth);
1874 if (err)
1875 return err;
1876
1877 /* put it into the cache */
Al Viro8c7bc842006-09-26 21:26:19 -07001878 hash = rt_hash(daddr, saddr, fl->iif);
Julian Anastasovce723d82005-09-08 13:34:47 -07001879 err = rt_intern_hash(hash, rth, &rtres);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001880 if (err)
1881 return err;
1882
1883 /* forward hop information to multipath impl. */
1884 multipath_set_nhinfo(rth,
1885 FIB_RES_NETWORK(*res),
1886 FIB_RES_NETMASK(*res),
1887 res->prefixlen,
1888 &FIB_RES_NH(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001889 }
Julian Anastasovce723d82005-09-08 13:34:47 -07001890 skb->dst = &rtres->u.dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891 return err;
1892#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1893 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1894#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1895}
1896
1897
1898/*
1899 * NOTE. We drop all the packets that has local source
1900 * addresses, because every properly looped back packet
1901 * must have correct destination already attached by output routine.
1902 *
1903 * Such approach solves two big problems:
1904 * 1. Not simplex devices are handled properly.
1905 * 2. IP spoofing attempts are filtered with 100% of guarantee.
1906 */
1907
/*
 * Slow path of input route resolution: classify the packet, build a
 * route cache entry for it and attach the entry to skb->dst.
 *
 * Outcomes visible in this function:
 *  - martian source:      logged via ip_handle_martian_source(), -EINVAL
 *  - martian destination: counted (and optionally logged), -EHOSTUNREACH
 *  - broadcast / local:   a local-delivery entry is created (local_input)
 *  - no FIB match:        -EHOSTUNREACH if forwarding is off on the
 *                         device, otherwise an RTN_UNREACHABLE entry
 *                         whose input handler is ip_error
 *  - unicast to forward:  handed to ip_mkroute_input()
 *
 * Returns 0 or a negative errno.  All exits after the in_dev check go
 * through 'done', which drops the in_dev reference and, when the FIB
 * lookup succeeded, the FIB result.
 */
Al Viro9e12bb22006-09-26 21:25:20 -07001908static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001909 u8 tos, struct net_device *dev)
1910{
1911 struct fib_result res;
1912 struct in_device *in_dev = in_dev_get(dev);
1913 struct flowi fl = { .nl_u = { .ip4_u =
1914 { .daddr = daddr,
1915 .saddr = saddr,
1916 .tos = tos,
1917 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001919 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001920 .iif = dev->ifindex };
1921 unsigned flags = 0;
1922 u32 itag = 0;
1923 struct rtable * rth;
1924 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07001925 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001926 int err = -EINVAL;
/* Set once fib_lookup() has succeeded, so 'done' releases res. */
1927 int free_res = 0;
1928
1929 /* IP on this device is disabled. */
1930
1931 if (!in_dev)
1932 goto out;
1933
1934 /* Check for the most weird martians, which can be not detected
1935 by fib_lookup.
1936 */
1937
1938 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1939 goto martian_source;
1940
/* Limited broadcast, or both addresses zero: skip the FIB lookup. */
Al Viroe4485152006-09-26 22:15:01 -07001941 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001942 goto brd_input;
1943
1944 /* Accept zero addresses only to limited broadcast;
1945 * I even do not know to fix it or not. Waiting for complains :-)
1946 */
1947 if (ZERONET(saddr))
1948 goto martian_source;
1949
1950 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1951 goto martian_destination;
1952
1953 /*
1954 * Now we are ready to route packet.
1955 */
/* On lookup failure err keeps fib_lookup()'s return value; the no_route
 * path stores its negation in the new entry's dst.error.
 */
1956 if ((err = fib_lookup(&fl, &res)) != 0) {
1957 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001958 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001959 goto no_route;
1960 }
1961 free_res = 1;
1962
1963 RT_CACHE_STAT_INC(in_slow_tot);
1964
1965 if (res.type == RTN_BROADCAST)
1966 goto brd_input;
1967
/* Locally destined: validate the source against the loopback ifindex
 * and use daddr itself as spec_dst.
 */
1968 if (res.type == RTN_LOCAL) {
1969 int result;
1970 result = fib_validate_source(saddr, daddr, tos,
1971 loopback_dev.ifindex,
1972 dev, &spec_dst, &itag);
1973 if (result < 0)
1974 goto martian_source;
1975 if (result)
1976 flags |= RTCF_DIRECTSRC;
1977 spec_dst = daddr;
1978 goto local_input;
1979 }
1980
1981 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001982 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001983 if (res.type != RTN_UNICAST)
1984 goto martian_destination;
1985
/* Forwarding case.  -ENOBUFS and -EINVAL are routed through their
 * labels; any other value falls through to 'done' unchanged.
 */
1986 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1987 if (err == -ENOBUFS)
1988 goto e_nobufs;
1989 if (err == -EINVAL)
1990 goto e_inval;
1991
/* Common exit: release what was acquired above. */
1992done:
1993 in_dev_put(in_dev);
1994 if (free_res)
1995 fib_res_put(&res);
1996out: return err;
1997
1998brd_input:
1999 if (skb->protocol != htons(ETH_P_IP))
2000 goto e_inval;
2001
2002 if (ZERONET(saddr))
2003 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2004 else {
2005 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2006 &itag);
2007 if (err < 0)
2008 goto martian_source;
2009 if (err)
2010 flags |= RTCF_DIRECTSRC;
2011 }
2012 flags |= RTCF_BROADCAST;
2013 res.type = RTN_BROADCAST;
2014 RT_CACHE_STAT_INC(in_brd);
2015
/* Shared by the broadcast, local and no_route cases: build an entry on
 * the loopback device.
 */
2016local_input:
2017 rth = dst_alloc(&ipv4_dst_ops);
2018 if (!rth)
2019 goto e_nobufs;
2020
2021 rth->u.dst.output= ip_rt_bug;
2022
2023 atomic_set(&rth->u.dst.__refcnt, 1);
2024 rth->u.dst.flags= DST_HOST;
2025 if (in_dev->cnf.no_policy)
2026 rth->u.dst.flags |= DST_NOPOLICY;
2027 rth->fl.fl4_dst = daddr;
2028 rth->rt_dst = daddr;
2029 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002030 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031 rth->fl.fl4_src = saddr;
2032 rth->rt_src = saddr;
2033#ifdef CONFIG_NET_CLS_ROUTE
2034 rth->u.dst.tclassid = itag;
2035#endif
2036 rth->rt_iif =
2037 rth->fl.iif = dev->ifindex;
2038 rth->u.dst.dev = &loopback_dev;
2039 dev_hold(rth->u.dst.dev);
2040 rth->idev = in_dev_get(rth->u.dst.dev);
2041 rth->rt_gateway = daddr;
2042 rth->rt_spec_dst= spec_dst;
2043 rth->u.dst.input= ip_local_deliver;
2044 rth->rt_flags = flags|RTCF_LOCAL;
/* Unreachable entry (set up at no_route): input goes to ip_error and
 * dst.error holds the positive errno.
 */
2045 if (res.type == RTN_UNREACHABLE) {
2046 rth->u.dst.input= ip_error;
2047 rth->u.dst.error= -err;
2048 rth->rt_flags &= ~RTCF_LOCAL;
2049 }
2050 rth->rt_type = res.type;
Al Viro8c7bc842006-09-26 21:26:19 -07002051 hash = rt_hash(daddr, saddr, fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2053 goto done;
2054
2055no_route:
2056 RT_CACHE_STAT_INC(in_no_route);
2057 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2058 res.type = RTN_UNREACHABLE;
2059 goto local_input;
2060
2061 /*
2062 * Do not cache martian addresses: they should be logged (RFC1812)
2063 */
2064martian_destination:
2065 RT_CACHE_STAT_INC(in_martian_dst);
2066#ifdef CONFIG_IP_ROUTE_VERBOSE
2067 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2068 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2069 "%u.%u.%u.%u, dev %s\n",
2070 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2071#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002072
/* martian_destination falls through to here. */
2073e_hostunreach:
2074 err = -EHOSTUNREACH;
2075 goto done;
2076
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077e_inval:
2078 err = -EINVAL;
2079 goto done;
2080
2081e_nobufs:
2082 err = -ENOBUFS;
2083 goto done;
2084
2085martian_source:
2086 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2087 goto e_inval;
2088}
2089
Al Viro9e12bb22006-09-26 21:25:20 -07002090int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002091 u8 tos, struct net_device *dev)
2092{
2093 struct rtable * rth;
2094 unsigned hash;
2095 int iif = dev->ifindex;
2096
2097 tos &= IPTOS_RT_MASK;
Al Viro8c7bc842006-09-26 21:26:19 -07002098 hash = rt_hash(daddr, saddr, iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002099
2100 rcu_read_lock();
2101 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2102 rth = rcu_dereference(rth->u.rt_next)) {
2103 if (rth->fl.fl4_dst == daddr &&
2104 rth->fl.fl4_src == saddr &&
2105 rth->fl.iif == iif &&
2106 rth->fl.oif == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002107 rth->fl.mark == skb->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002108 rth->fl.fl4_tos == tos) {
2109 rth->u.dst.lastuse = jiffies;
2110 dst_hold(&rth->u.dst);
2111 rth->u.dst.__use++;
2112 RT_CACHE_STAT_INC(in_hit);
2113 rcu_read_unlock();
2114 skb->dst = (struct dst_entry*)rth;
2115 return 0;
2116 }
2117 RT_CACHE_STAT_INC(in_hlist_search);
2118 }
2119 rcu_read_unlock();
2120
2121 /* Multicast recognition logic is moved from route cache to here.
2122 The problem was that too many Ethernet cards have broken/missing
2123 hardware multicast filters :-( As result the host on multicasting
2124 network acquires a lot of useless route cache entries, sort of
2125 SDR messages from all the world. Now we try to get rid of them.
2126 Really, provided software IP multicast filter is organized
2127 reasonably (at least, hashed), it does not result in a slowdown
2128 comparing with route cache reject entries.
2129 Note, that multicast routers are not affected, because
2130 route cache entry is created eventually.
2131 */
2132 if (MULTICAST(daddr)) {
2133 struct in_device *in_dev;
2134
2135 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002136 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002137 int our = ip_check_mc(in_dev, daddr, saddr,
2138 skb->nh.iph->protocol);
2139 if (our
2140#ifdef CONFIG_IP_MROUTE
2141 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2142#endif
2143 ) {
2144 rcu_read_unlock();
2145 return ip_route_input_mc(skb, daddr, saddr,
2146 tos, dev, our);
2147 }
2148 }
2149 rcu_read_unlock();
2150 return -EINVAL;
2151 }
2152 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2153}
2154
2155static inline int __mkroute_output(struct rtable **result,
2156 struct fib_result* res,
2157 const struct flowi *fl,
2158 const struct flowi *oldflp,
2159 struct net_device *dev_out,
2160 unsigned flags)
2161{
2162 struct rtable *rth;
2163 struct in_device *in_dev;
2164 u32 tos = RT_FL_TOS(oldflp);
2165 int err = 0;
2166
2167 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2168 return -EINVAL;
2169
Al Viroe4485152006-09-26 22:15:01 -07002170 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171 res->type = RTN_BROADCAST;
2172 else if (MULTICAST(fl->fl4_dst))
2173 res->type = RTN_MULTICAST;
2174 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2175 return -EINVAL;
2176
2177 if (dev_out->flags & IFF_LOOPBACK)
2178 flags |= RTCF_LOCAL;
2179
2180 /* get work reference to inet device */
2181 in_dev = in_dev_get(dev_out);
2182 if (!in_dev)
2183 return -EINVAL;
2184
2185 if (res->type == RTN_BROADCAST) {
2186 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2187 if (res->fi) {
2188 fib_info_put(res->fi);
2189 res->fi = NULL;
2190 }
2191 } else if (res->type == RTN_MULTICAST) {
2192 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2193 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2194 oldflp->proto))
2195 flags &= ~RTCF_LOCAL;
2196 /* If multicast route do not exist use
2197 default one, but do not gateway in this case.
2198 Yes, it is hack.
2199 */
2200 if (res->fi && res->prefixlen < 4) {
2201 fib_info_put(res->fi);
2202 res->fi = NULL;
2203 }
2204 }
2205
2206
2207 rth = dst_alloc(&ipv4_dst_ops);
2208 if (!rth) {
2209 err = -ENOBUFS;
2210 goto cleanup;
2211 }
2212
Julian Anastasovce723d82005-09-08 13:34:47 -07002213 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002214 rth->u.dst.flags= DST_HOST;
2215#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2216 if (res->fi) {
2217 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2218 if (res->fi->fib_nhs > 1)
2219 rth->u.dst.flags |= DST_BALANCED;
2220 }
2221#endif
2222 if (in_dev->cnf.no_xfrm)
2223 rth->u.dst.flags |= DST_NOXFRM;
2224 if (in_dev->cnf.no_policy)
2225 rth->u.dst.flags |= DST_NOPOLICY;
2226
2227 rth->fl.fl4_dst = oldflp->fl4_dst;
2228 rth->fl.fl4_tos = tos;
2229 rth->fl.fl4_src = oldflp->fl4_src;
2230 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002231 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002232 rth->rt_dst = fl->fl4_dst;
2233 rth->rt_src = fl->fl4_src;
2234 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2235 /* get references to the devices that are to be hold by the routing
2236 cache entry */
2237 rth->u.dst.dev = dev_out;
2238 dev_hold(dev_out);
2239 rth->idev = in_dev_get(dev_out);
2240 rth->rt_gateway = fl->fl4_dst;
2241 rth->rt_spec_dst= fl->fl4_src;
2242
2243 rth->u.dst.output=ip_output;
2244
2245 RT_CACHE_STAT_INC(out_slow_tot);
2246
2247 if (flags & RTCF_LOCAL) {
2248 rth->u.dst.input = ip_local_deliver;
2249 rth->rt_spec_dst = fl->fl4_dst;
2250 }
2251 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2252 rth->rt_spec_dst = fl->fl4_src;
2253 if (flags & RTCF_LOCAL &&
2254 !(dev_out->flags & IFF_LOOPBACK)) {
2255 rth->u.dst.output = ip_mc_output;
2256 RT_CACHE_STAT_INC(out_slow_mc);
2257 }
2258#ifdef CONFIG_IP_MROUTE
2259 if (res->type == RTN_MULTICAST) {
2260 if (IN_DEV_MFORWARD(in_dev) &&
2261 !LOCAL_MCAST(oldflp->fl4_dst)) {
2262 rth->u.dst.input = ip_mr_input;
2263 rth->u.dst.output = ip_mc_output;
2264 }
2265 }
2266#endif
2267 }
2268
2269 rt_set_nexthop(rth, res, 0);
2270
2271 rth->rt_flags = flags;
2272
2273 *result = rth;
2274 cleanup:
2275 /* release work reference to inet device */
2276 in_dev_put(in_dev);
2277
2278 return err;
2279}
2280
2281static inline int ip_mkroute_output_def(struct rtable **rp,
2282 struct fib_result* res,
2283 const struct flowi *fl,
2284 const struct flowi *oldflp,
2285 struct net_device *dev_out,
2286 unsigned flags)
2287{
Chuck Short7abaa272005-06-22 22:10:23 -07002288 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002289 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2290 unsigned hash;
2291 if (err == 0) {
Al Viro8c7bc842006-09-26 21:26:19 -07002292 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002293 err = rt_intern_hash(hash, rth, rp);
2294 }
2295
2296 return err;
2297}
2298
/*
 * ip_mkroute_output - build and cache the output route(s) for a lookup.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the fib result has at
 * most one nexthop, this is just ip_mkroute_output_def().
 *
 * Otherwise one cache entry is created per nexthop of res->fi.  For every
 * hop, res->nh_sel is set to that hop, the reference on the previous
 * hop's result in *rp is dropped, a new entry is built and interned, and
 * the nexthop information is forwarded to the multipath implementation.
 * After the loop *rp refers to the result of the last hop.
 *
 * Returns 0 on success or the first error encountered; on error the loop
 * stops at the failing hop.
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
				       oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * The entry was not interned, so it must not be
			 * touched again: skip the nexthop registration,
			 * which dereferences rth.
			 * NOTE(review): rt_intern_hash() is expected to have
			 * dropped rth on failure - confirm against its
			 * definition.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2360
2361/*
2362 * Major route resolver routine.
2363 */
2364
2365static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2366{
2367 u32 tos = RT_FL_TOS(oldflp);
2368 struct flowi fl = { .nl_u = { .ip4_u =
2369 { .daddr = oldflp->fl4_dst,
2370 .saddr = oldflp->fl4_src,
2371 .tos = tos & IPTOS_RT_MASK,
2372 .scope = ((tos & RTO_ONLINK) ?
2373 RT_SCOPE_LINK :
2374 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002375 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002376 .mark = oldflp->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002377 .iif = loopback_dev.ifindex,
2378 .oif = oldflp->oif };
2379 struct fib_result res;
2380 unsigned flags = 0;
2381 struct net_device *dev_out = NULL;
2382 int free_res = 0;
2383 int err;
2384
2385
2386 res.fi = NULL;
2387#ifdef CONFIG_IP_MULTIPLE_TABLES
2388 res.r = NULL;
2389#endif
2390
2391 if (oldflp->fl4_src) {
2392 err = -EINVAL;
2393 if (MULTICAST(oldflp->fl4_src) ||
2394 BADCLASS(oldflp->fl4_src) ||
2395 ZERONET(oldflp->fl4_src))
2396 goto out;
2397
2398 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2399 dev_out = ip_dev_find(oldflp->fl4_src);
2400 if (dev_out == NULL)
2401 goto out;
2402
2403 /* I removed check for oif == dev_out->oif here.
2404 It was wrong for two reasons:
2405 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2406 assigned to multiple interfaces.
2407 2. Moreover, we are allowed to send packets with saddr
2408 of another iface. --ANK
2409 */
2410
2411 if (oldflp->oif == 0
Al Viroe4485152006-09-26 22:15:01 -07002412 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002413 /* Special hack: user can direct multicasts
2414 and limited broadcast via necessary interface
2415 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2416 This hack is not just for fun, it allows
2417 vic,vat and friends to work.
2418 They bind socket to loopback, set ttl to zero
2419 and expect that it will work.
2420 From the viewpoint of routing cache they are broken,
2421 because we are not allowed to build multicast path
2422 with loopback source addr (look, routing cache
2423 cannot know, that ttl is zero, so that packet
2424 will not leave this host and route is valid).
2425 Luckily, this hack is good workaround.
2426 */
2427
2428 fl.oif = dev_out->ifindex;
2429 goto make_route;
2430 }
2431 if (dev_out)
2432 dev_put(dev_out);
2433 dev_out = NULL;
2434 }
2435
2436
2437 if (oldflp->oif) {
2438 dev_out = dev_get_by_index(oldflp->oif);
2439 err = -ENODEV;
2440 if (dev_out == NULL)
2441 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002442
2443 /* RACE: Check return value of inet_select_addr instead. */
2444 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002445 dev_put(dev_out);
2446 goto out; /* Wrong error code */
2447 }
2448
Al Viroe4485152006-09-26 22:15:01 -07002449 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002450 if (!fl.fl4_src)
2451 fl.fl4_src = inet_select_addr(dev_out, 0,
2452 RT_SCOPE_LINK);
2453 goto make_route;
2454 }
2455 if (!fl.fl4_src) {
2456 if (MULTICAST(oldflp->fl4_dst))
2457 fl.fl4_src = inet_select_addr(dev_out, 0,
2458 fl.fl4_scope);
2459 else if (!oldflp->fl4_dst)
2460 fl.fl4_src = inet_select_addr(dev_out, 0,
2461 RT_SCOPE_HOST);
2462 }
2463 }
2464
2465 if (!fl.fl4_dst) {
2466 fl.fl4_dst = fl.fl4_src;
2467 if (!fl.fl4_dst)
2468 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2469 if (dev_out)
2470 dev_put(dev_out);
2471 dev_out = &loopback_dev;
2472 dev_hold(dev_out);
2473 fl.oif = loopback_dev.ifindex;
2474 res.type = RTN_LOCAL;
2475 flags |= RTCF_LOCAL;
2476 goto make_route;
2477 }
2478
2479 if (fib_lookup(&fl, &res)) {
2480 res.fi = NULL;
2481 if (oldflp->oif) {
2482 /* Apparently, routing tables are wrong. Assume,
2483 that the destination is on link.
2484
2485 WHY? DW.
2486 Because we are allowed to send to iface
2487 even if it has NO routes and NO assigned
2488 addresses. When oif is specified, routing
2489 tables are looked up with only one purpose:
2490 to catch if destination is gatewayed, rather than
2491 direct. Moreover, if MSG_DONTROUTE is set,
2492 we send packet, ignoring both routing tables
2493 and ifaddr state. --ANK
2494
2495
2496 We could make it even if oif is unknown,
2497 likely IPv6, but we do not.
2498 */
2499
2500 if (fl.fl4_src == 0)
2501 fl.fl4_src = inet_select_addr(dev_out, 0,
2502 RT_SCOPE_LINK);
2503 res.type = RTN_UNICAST;
2504 goto make_route;
2505 }
2506 if (dev_out)
2507 dev_put(dev_out);
2508 err = -ENETUNREACH;
2509 goto out;
2510 }
2511 free_res = 1;
2512
2513 if (res.type == RTN_LOCAL) {
2514 if (!fl.fl4_src)
2515 fl.fl4_src = fl.fl4_dst;
2516 if (dev_out)
2517 dev_put(dev_out);
2518 dev_out = &loopback_dev;
2519 dev_hold(dev_out);
2520 fl.oif = dev_out->ifindex;
2521 if (res.fi)
2522 fib_info_put(res.fi);
2523 res.fi = NULL;
2524 flags |= RTCF_LOCAL;
2525 goto make_route;
2526 }
2527
2528#ifdef CONFIG_IP_ROUTE_MULTIPATH
2529 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2530 fib_select_multipath(&fl, &res);
2531 else
2532#endif
2533 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2534 fib_select_default(&fl, &res);
2535
2536 if (!fl.fl4_src)
2537 fl.fl4_src = FIB_RES_PREFSRC(res);
2538
2539 if (dev_out)
2540 dev_put(dev_out);
2541 dev_out = FIB_RES_DEV(res);
2542 dev_hold(dev_out);
2543 fl.oif = dev_out->ifindex;
2544
2545
2546make_route:
2547 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2548
2549
2550 if (free_res)
2551 fib_res_put(&res);
2552 if (dev_out)
2553 dev_put(dev_out);
2554out: return err;
2555}
2556
2557int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2558{
2559 unsigned hash;
2560 struct rtable *rth;
2561
Al Viro8c7bc842006-09-26 21:26:19 -07002562 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002563
2564 rcu_read_lock_bh();
2565 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2566 rth = rcu_dereference(rth->u.rt_next)) {
2567 if (rth->fl.fl4_dst == flp->fl4_dst &&
2568 rth->fl.fl4_src == flp->fl4_src &&
2569 rth->fl.iif == 0 &&
2570 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002571 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002572 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2573 (IPTOS_RT_MASK | RTO_ONLINK))) {
2574
2575 /* check for multipath routes and choose one if
2576 * necessary
2577 */
2578 if (multipath_select_route(flp, rth, rp)) {
2579 dst_hold(&(*rp)->u.dst);
2580 RT_CACHE_STAT_INC(out_hit);
2581 rcu_read_unlock_bh();
2582 return 0;
2583 }
2584
2585 rth->u.dst.lastuse = jiffies;
2586 dst_hold(&rth->u.dst);
2587 rth->u.dst.__use++;
2588 RT_CACHE_STAT_INC(out_hit);
2589 rcu_read_unlock_bh();
2590 *rp = rth;
2591 return 0;
2592 }
2593 RT_CACHE_STAT_INC(out_hlist_search);
2594 }
2595 rcu_read_unlock_bh();
2596
2597 return ip_route_output_slow(rp, flp);
2598}
2599
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002600EXPORT_SYMBOL_GPL(__ip_route_output_key);
2601
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2603{
2604 int err;
2605
2606 if ((err = __ip_route_output_key(rp, flp)) != 0)
2607 return err;
2608
2609 if (flp->proto) {
2610 if (!flp->fl4_src)
2611 flp->fl4_src = (*rp)->rt_src;
2612 if (!flp->fl4_dst)
2613 flp->fl4_dst = (*rp)->rt_dst;
2614 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2615 }
2616
2617 return 0;
2618}
2619
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002620EXPORT_SYMBOL_GPL(ip_route_output_flow);
2621
Linus Torvalds1da177e2005-04-16 15:20:36 -07002622int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2623{
2624 return ip_route_output_flow(rp, flp, NULL, 0);
2625}
2626
2627static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002628 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002629{
2630 struct rtable *rt = (struct rtable*)skb->dst;
2631 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002632 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002633 long expires;
2634 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002635
2636 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2637 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002638 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002639
2640 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002641 r->rtm_family = AF_INET;
2642 r->rtm_dst_len = 32;
2643 r->rtm_src_len = 0;
2644 r->rtm_tos = rt->fl.fl4_tos;
2645 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002646 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647 r->rtm_type = rt->rt_type;
2648 r->rtm_scope = RT_SCOPE_UNIVERSE;
2649 r->rtm_protocol = RTPROT_UNSPEC;
2650 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2651 if (rt->rt_flags & RTCF_NOTIFY)
2652 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002653
Al Viro17fb2c62006-09-26 22:15:25 -07002654 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002655
Linus Torvalds1da177e2005-04-16 15:20:36 -07002656 if (rt->fl.fl4_src) {
2657 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002658 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002659 }
2660 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002661 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002662#ifdef CONFIG_NET_CLS_ROUTE
2663 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002664 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665#endif
2666#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
Thomas Grafbe403ea2006-08-17 18:15:17 -07002667 if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2668 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002669#endif
2670 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002671 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002672 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002673 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002674
Linus Torvalds1da177e2005-04-16 15:20:36 -07002675 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002676 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002677
Linus Torvalds1da177e2005-04-16 15:20:36 -07002678 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002679 goto nla_put_failure;
2680
Thomas Grafe3703b32006-11-27 09:27:07 -08002681 error = rt->u.dst.error;
2682 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002683 if (rt->peer) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002684 id = rt->peer->ip_id_count;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002685 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002686 ts = rt->peer->tcp_ts;
2687 tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002688 }
2689 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002690
Linus Torvalds1da177e2005-04-16 15:20:36 -07002691 if (rt->fl.iif) {
2692#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002693 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002694
2695 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2696 ipv4_devconf.mc_forwarding) {
2697 int err = ipmr_get_route(skb, r, nowait);
2698 if (err <= 0) {
2699 if (!nowait) {
2700 if (err == 0)
2701 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002702 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002703 } else {
2704 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002705 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002706 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002707 }
2708 }
2709 } else
2710#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002711 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002712 }
2713
Thomas Grafe3703b32006-11-27 09:27:07 -08002714 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2715 expires, error) < 0)
2716 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002717
Thomas Grafbe403ea2006-08-17 18:15:17 -07002718 return nlmsg_end(skb, nlh);
2719
2720nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002721 nlmsg_cancel(skb, nlh);
2722 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002723}
2724
2725int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2726{
Thomas Grafd889ce32006-08-17 18:15:44 -07002727 struct rtmsg *rtm;
2728 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002729 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002730 __be32 dst = 0;
2731 __be32 src = 0;
2732 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002733 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002734 struct sk_buff *skb;
2735
Thomas Grafd889ce32006-08-17 18:15:44 -07002736 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2737 if (err < 0)
2738 goto errout;
2739
2740 rtm = nlmsg_data(nlh);
2741
Linus Torvalds1da177e2005-04-16 15:20:36 -07002742 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002743 if (skb == NULL) {
2744 err = -ENOBUFS;
2745 goto errout;
2746 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002747
2748 /* Reserve room for dummy headers, this skb can pass
2749 through good chunk of routing engine.
2750 */
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002751 skb->mac.raw = skb->nh.raw = skb->data;
2752
2753 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2754 skb->nh.iph->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002755 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2756
Al Viro17fb2c62006-09-26 22:15:25 -07002757 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2758 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002759 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002760
2761 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002762 struct net_device *dev;
2763
2764 dev = __dev_get_by_index(iif);
2765 if (dev == NULL) {
2766 err = -ENODEV;
2767 goto errout_free;
2768 }
2769
Linus Torvalds1da177e2005-04-16 15:20:36 -07002770 skb->protocol = htons(ETH_P_IP);
2771 skb->dev = dev;
2772 local_bh_disable();
2773 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2774 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002775
2776 rt = (struct rtable*) skb->dst;
2777 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778 err = -rt->u.dst.error;
2779 } else {
Thomas Grafd889ce32006-08-17 18:15:44 -07002780 struct flowi fl = {
2781 .nl_u = {
2782 .ip4_u = {
2783 .daddr = dst,
2784 .saddr = src,
2785 .tos = rtm->rtm_tos,
2786 },
2787 },
2788 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2789 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07002790 err = ip_route_output_key(&rt, &fl);
2791 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002792
Linus Torvalds1da177e2005-04-16 15:20:36 -07002793 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002794 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002795
2796 skb->dst = &rt->u.dst;
2797 if (rtm->rtm_flags & RTM_F_NOTIFY)
2798 rt->rt_flags |= RTCF_NOTIFY;
2799
Linus Torvalds1da177e2005-04-16 15:20:36 -07002800 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002801 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002802 if (err <= 0)
2803 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002804
Thomas Graf2942e902006-08-15 00:30:25 -07002805 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002806errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002807 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002808
Thomas Grafd889ce32006-08-17 18:15:44 -07002809errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002811 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002812}
2813
2814int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2815{
2816 struct rtable *rt;
2817 int h, s_h;
2818 int idx, s_idx;
2819
2820 s_h = cb->args[0];
2821 s_idx = idx = cb->args[1];
2822 for (h = 0; h <= rt_hash_mask; h++) {
2823 if (h < s_h) continue;
2824 if (h > s_h)
2825 s_idx = 0;
2826 rcu_read_lock_bh();
2827 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2828 rt = rcu_dereference(rt->u.rt_next), idx++) {
2829 if (idx < s_idx)
2830 continue;
2831 skb->dst = dst_clone(&rt->u.dst);
2832 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002833 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2834 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835 dst_release(xchg(&skb->dst, NULL));
2836 rcu_read_unlock_bh();
2837 goto done;
2838 }
2839 dst_release(xchg(&skb->dst, NULL));
2840 }
2841 rcu_read_unlock_bh();
2842 }
2843
2844done:
2845 cb->args[0] = h;
2846 cb->args[1] = idx;
2847 return skb->len;
2848}
2849
/*
 * ip_rt_multicast_event - flush the route cache with delay 0.
 * @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}
2854
2855#ifdef CONFIG_SYSCTL
2856static int flush_delay;
2857
2858static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2859 struct file *filp, void __user *buffer,
2860 size_t *lenp, loff_t *ppos)
2861{
2862 if (write) {
2863 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2864 rt_cache_flush(flush_delay);
2865 return 0;
2866 }
2867
2868 return -EINVAL;
2869}
2870
2871static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2872 int __user *name,
2873 int nlen,
2874 void __user *oldval,
2875 size_t __user *oldlenp,
2876 void __user *newval,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -08002877 size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002878{
2879 int delay;
2880 if (newlen != sizeof(int))
2881 return -EINVAL;
2882 if (get_user(delay, (int __user *)newval))
2883 return -EFAULT;
2884 rt_cache_flush(delay);
2885 return 0;
2886}
2887
2888ctl_table ipv4_route_table[] = {
2889 {
2890 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2891 .procname = "flush",
2892 .data = &flush_delay,
2893 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002894 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002895 .proc_handler = &ipv4_sysctl_rtcache_flush,
2896 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2897 },
2898 {
2899 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2900 .procname = "min_delay",
2901 .data = &ip_rt_min_delay,
2902 .maxlen = sizeof(int),
2903 .mode = 0644,
2904 .proc_handler = &proc_dointvec_jiffies,
2905 .strategy = &sysctl_jiffies,
2906 },
2907 {
2908 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2909 .procname = "max_delay",
2910 .data = &ip_rt_max_delay,
2911 .maxlen = sizeof(int),
2912 .mode = 0644,
2913 .proc_handler = &proc_dointvec_jiffies,
2914 .strategy = &sysctl_jiffies,
2915 },
2916 {
2917 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2918 .procname = "gc_thresh",
2919 .data = &ipv4_dst_ops.gc_thresh,
2920 .maxlen = sizeof(int),
2921 .mode = 0644,
2922 .proc_handler = &proc_dointvec,
2923 },
2924 {
2925 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2926 .procname = "max_size",
2927 .data = &ip_rt_max_size,
2928 .maxlen = sizeof(int),
2929 .mode = 0644,
2930 .proc_handler = &proc_dointvec,
2931 },
2932 {
2933 /* Deprecated. Use gc_min_interval_ms */
2934
2935 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2936 .procname = "gc_min_interval",
2937 .data = &ip_rt_gc_min_interval,
2938 .maxlen = sizeof(int),
2939 .mode = 0644,
2940 .proc_handler = &proc_dointvec_jiffies,
2941 .strategy = &sysctl_jiffies,
2942 },
2943 {
2944 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2945 .procname = "gc_min_interval_ms",
2946 .data = &ip_rt_gc_min_interval,
2947 .maxlen = sizeof(int),
2948 .mode = 0644,
2949 .proc_handler = &proc_dointvec_ms_jiffies,
2950 .strategy = &sysctl_ms_jiffies,
2951 },
2952 {
2953 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2954 .procname = "gc_timeout",
2955 .data = &ip_rt_gc_timeout,
2956 .maxlen = sizeof(int),
2957 .mode = 0644,
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
2960 },
2961 {
2962 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2963 .procname = "gc_interval",
2964 .data = &ip_rt_gc_interval,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec_jiffies,
2968 .strategy = &sysctl_jiffies,
2969 },
2970 {
2971 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2972 .procname = "redirect_load",
2973 .data = &ip_rt_redirect_load,
2974 .maxlen = sizeof(int),
2975 .mode = 0644,
2976 .proc_handler = &proc_dointvec,
2977 },
2978 {
2979 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2980 .procname = "redirect_number",
2981 .data = &ip_rt_redirect_number,
2982 .maxlen = sizeof(int),
2983 .mode = 0644,
2984 .proc_handler = &proc_dointvec,
2985 },
2986 {
2987 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2988 .procname = "redirect_silence",
2989 .data = &ip_rt_redirect_silence,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2993 },
2994 {
2995 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2996 .procname = "error_cost",
2997 .data = &ip_rt_error_cost,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec,
3001 },
3002 {
3003 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3004 .procname = "error_burst",
3005 .data = &ip_rt_error_burst,
3006 .maxlen = sizeof(int),
3007 .mode = 0644,
3008 .proc_handler = &proc_dointvec,
3009 },
3010 {
3011 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3012 .procname = "gc_elasticity",
3013 .data = &ip_rt_gc_elasticity,
3014 .maxlen = sizeof(int),
3015 .mode = 0644,
3016 .proc_handler = &proc_dointvec,
3017 },
3018 {
3019 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3020 .procname = "mtu_expires",
3021 .data = &ip_rt_mtu_expires,
3022 .maxlen = sizeof(int),
3023 .mode = 0644,
3024 .proc_handler = &proc_dointvec_jiffies,
3025 .strategy = &sysctl_jiffies,
3026 },
3027 {
3028 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3029 .procname = "min_pmtu",
3030 .data = &ip_rt_min_pmtu,
3031 .maxlen = sizeof(int),
3032 .mode = 0644,
3033 .proc_handler = &proc_dointvec,
3034 },
3035 {
3036 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3037 .procname = "min_adv_mss",
3038 .data = &ip_rt_min_advmss,
3039 .maxlen = sizeof(int),
3040 .mode = 0644,
3041 .proc_handler = &proc_dointvec,
3042 },
3043 {
3044 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3045 .procname = "secret_interval",
3046 .data = &ip_rt_secret_interval,
3047 .maxlen = sizeof(int),
3048 .mode = 0644,
3049 .proc_handler = &proc_dointvec_jiffies,
3050 .strategy = &sysctl_jiffies,
3051 },
3052 { .ctl_name = 0 }
3053};
3054#endif
3055
3056#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks. But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 *
 * Each CPU owns a run of 256 consecutive entries in the ip_rt_acct
 * array; this yields the address of the first entry for cpu "i".
 * The argument is parenthesized so that an expression argument
 * (e.g. "cpu + 1") is scaled as a whole rather than only its last term.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3063
3064#ifdef CONFIG_PROC_FS
3065static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3066 int length, int *eof, void *data)
3067{
3068 unsigned int i;
3069
3070 if ((offset & 3) || (length & 3))
3071 return -EIO;
3072
3073 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3074 *eof = 1;
3075 return 0;
3076 }
3077
3078 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3079 length = sizeof(struct ip_rt_acct) * 256 - offset;
3080 *eof = 1;
3081 }
3082
3083 offset /= sizeof(u32);
3084
3085 if (length > 0) {
3086 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3087 u32 *dst = (u32 *) buffer;
3088
3089 /* Copy first cpu. */
3090 *start = buffer;
3091 memcpy(dst, src, length);
3092
3093 /* Add the other cpus in, one int at a time */
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07003094 for_each_possible_cpu(i) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003095 unsigned int j;
3096
3097 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3098
3099 for (j = 0; j < length/4; j++)
3100 dst[j] += src[j];
3101 }
3102 }
3103 return length;
3104}
3105#endif /* CONFIG_PROC_FS */
3106#endif /* CONFIG_NET_CLS_ROUTE */
3107
3108static __initdata unsigned long rhash_entries;
3109static int __init set_rhash_entries(char *str)
3110{
3111 if (!str)
3112 return 0;
3113 rhash_entries = simple_strtoul(str, &str, 0);
3114 return 1;
3115}
3116__setup("rhash_entries=", set_rhash_entries);
3117
3118int __init ip_rt_init(void)
3119{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003120 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003121
3122 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3123 (jiffies ^ (jiffies >> 7)));
3124
3125#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet424c4b72005-07-05 14:58:19 -07003126 {
3127 int order;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003128 for (order = 0;
3129 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3130 /* NOTHING */;
3131 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3132 if (!ip_rt_acct)
3133 panic("IP: failed to allocate ip_rt_acct\n");
3134 memset(ip_rt_acct, 0, PAGE_SIZE << order);
Eric Dumazet424c4b72005-07-05 14:58:19 -07003135 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003136#endif
3137
Alexey Dobriyane5d679f2006-08-26 19:25:52 -07003138 ipv4_dst_ops.kmem_cachep =
3139 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3140 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003141
Eric Dumazet424c4b72005-07-05 14:58:19 -07003142 rt_hash_table = (struct rt_hash_bucket *)
3143 alloc_large_system_hash("IP route cache",
3144 sizeof(struct rt_hash_bucket),
3145 rhash_entries,
3146 (num_physpages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003147 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003148 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003149 &rt_hash_log,
3150 &rt_hash_mask,
3151 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003152 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3153 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003154
3155 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3156 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3157
Linus Torvalds1da177e2005-04-16 15:20:36 -07003158 devinet_init();
3159 ip_fib_init();
3160
3161 init_timer(&rt_flush_timer);
3162 rt_flush_timer.function = rt_run_flush;
3163 init_timer(&rt_periodic_timer);
3164 rt_periodic_timer.function = rt_check_expire;
3165 init_timer(&rt_secret_timer);
3166 rt_secret_timer.function = rt_secret_rebuild;
3167
3168 /* All the timers, started at system startup tend
3169 to synchronize. Perturb it a bit.
3170 */
3171 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3172 ip_rt_gc_interval;
3173 add_timer(&rt_periodic_timer);
3174
3175 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3176 ip_rt_secret_interval;
3177 add_timer(&rt_secret_timer);
3178
3179#ifdef CONFIG_PROC_FS
3180 {
3181 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3182 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3183 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3184 proc_net_stat))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185 return -ENOMEM;
3186 }
3187 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3188 }
3189#ifdef CONFIG_NET_CLS_ROUTE
3190 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3191#endif
3192#endif
3193#ifdef CONFIG_XFRM
3194 xfrm_init();
3195 xfrm4_init();
3196#endif
3197 return rc;
3198}
3199
3200EXPORT_SYMBOL(__ip_select_ident);
3201EXPORT_SYMBOL(ip_route_input);
3202EXPORT_SYMBOL(ip_route_output_key);