blob: 11c167118e87fa87ecfd7898a6190c06fd24d7a0 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
41 *
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070057 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080058 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070060 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070075#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070076#include <linux/string.h>
77#include <linux/socket.h>
78#include <linux/sockios.h>
79#include <linux/errno.h>
80#include <linux/in.h>
81#include <linux/inet.h>
82#include <linux/netdevice.h>
83#include <linux/proc_fs.h>
84#include <linux/init.h>
85#include <linux/skbuff.h>
86#include <linux/rtnetlink.h>
87#include <linux/inetdevice.h>
88#include <linux/igmp.h>
89#include <linux/pkt_sched.h>
90#include <linux/mroute.h>
91#include <linux/netfilter_ipv4.h>
92#include <linux/random.h>
93#include <linux/jhash.h>
94#include <linux/rcupdate.h>
95#include <linux/times.h>
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
106#include <net/ip_mp_alg.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700107#include <net/netevent.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700108#ifdef CONFIG_SYSCTL
109#include <linux/sysctl.h>
110#endif
111
/*
 * Extract the TOS bits selected by IPTOS_RT_MASK, together with the
 * RTO_ONLINK flag, from the flow key that oldflp points to.
 * The argument is parenthesized so that any pointer expression is safe.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115#define IP_MAX_MTU 0xFFF0
116
117#define RT_GC_TIMEOUT (300*HZ)
118
119static int ip_rt_min_delay = 2 * HZ;
120static int ip_rt_max_delay = 10 * HZ;
121static int ip_rt_max_size;
122static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
123static int ip_rt_gc_interval = 60 * HZ;
124static int ip_rt_gc_min_interval = HZ / 2;
125static int ip_rt_redirect_number = 9;
126static int ip_rt_redirect_load = HZ / 50;
127static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
128static int ip_rt_error_cost = HZ;
129static int ip_rt_error_burst = 5 * HZ;
130static int ip_rt_gc_elasticity = 8;
131static int ip_rt_mtu_expires = 10 * 60 * HZ;
132static int ip_rt_min_pmtu = 512 + 20 + 20;
133static int ip_rt_min_advmss = 256;
134static int ip_rt_secret_interval = 10 * 60 * HZ;
135static unsigned long rt_deadline;
136
137#define RTprint(a...) printk(KERN_DEBUG a)
138
139static struct timer_list rt_flush_timer;
140static struct timer_list rt_periodic_timer;
141static struct timer_list rt_secret_timer;
142
143/*
144 * Interface to generic destination cache.
145 */
146
147static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
148static void ipv4_dst_destroy(struct dst_entry *dst);
149static void ipv4_dst_ifdown(struct dst_entry *dst,
150 struct net_device *dev, int how);
151static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
152static void ipv4_link_failure(struct sk_buff *skb);
153static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
154static int rt_garbage_collect(void);
155
156
157static struct dst_ops ipv4_dst_ops = {
158 .family = AF_INET,
159 .protocol = __constant_htons(ETH_P_IP),
160 .gc = rt_garbage_collect,
161 .check = ipv4_dst_check,
162 .destroy = ipv4_dst_destroy,
163 .ifdown = ipv4_dst_ifdown,
164 .negative_advice = ipv4_negative_advice,
165 .link_failure = ipv4_link_failure,
166 .update_pmtu = ip_rt_update_pmtu,
167 .entry_size = sizeof(struct rtable),
168};
169
170#define ECN_OR_COST(class) TC_PRIO_##class
171
172__u8 ip_tos2prio[16] = {
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(FILLER),
175 TC_PRIO_BESTEFFORT,
176 ECN_OR_COST(BESTEFFORT),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_BULK,
180 ECN_OR_COST(BULK),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE,
184 ECN_OR_COST(INTERACTIVE),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK),
187 TC_PRIO_INTERACTIVE_BULK,
188 ECN_OR_COST(INTERACTIVE_BULK)
189};
190
191
192/*
193 * Route cache.
194 */
195
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
205
/* One bucket of the route cache hash table: the head of an rtable chain. */
struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Address of the spinlock covering hash slot 'slot'.  The expansion is
 * fully parenthesized so it can be used inside any larger expression.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the lock table; panics if the allocation fails.
 * Wrapped in do { } while (0) so that "rt_hash_lock_init();" is a single
 * statement wherever it is used.
 */
# define rt_hash_lock_init()	do { \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
	} while (0)
#else
/* No lock table in this configuration: the lock address is NULL. */
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()	do { } while (0)
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700245
246static struct rt_hash_bucket *rt_hash_table;
247static unsigned rt_hash_mask;
248static int rt_hash_log;
249static unsigned int rt_hash_rnd;
250
Eric Dumazet2f970d82006-01-17 02:54:36 -0800251static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800252#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700253 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700254
255static int rt_intern_hash(unsigned hash, struct rtable *rth,
256 struct rtable **res);
257
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800258static unsigned int rt_hash_code(u32 daddr, u32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259{
Ilia Sotnikovcef26852006-03-25 01:38:55 -0800260 return (jhash_2words(daddr, saddr, rt_hash_rnd)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261 & rt_hash_mask);
262}
263
Al Viro8c7bc842006-09-26 21:26:19 -0700264#define rt_hash(daddr, saddr, idx) \
265 rt_hash_code((__force u32)(__be32)(daddr),\
266 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
267
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268#ifdef CONFIG_PROC_FS
/* Iterator state for the route cache seq_file: the bucket being walked. */
struct rt_cache_iter_state {
	int	bucket;
};
272
273static struct rtable *rt_cache_get_first(struct seq_file *seq)
274{
275 struct rtable *r = NULL;
276 struct rt_cache_iter_state *st = seq->private;
277
278 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
279 rcu_read_lock_bh();
280 r = rt_hash_table[st->bucket].chain;
281 if (r)
282 break;
283 rcu_read_unlock_bh();
284 }
285 return r;
286}
287
288static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
289{
290 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
291
292 r = r->u.rt_next;
293 while (!r) {
294 rcu_read_unlock_bh();
295 if (--st->bucket < 0)
296 break;
297 rcu_read_lock_bh();
298 r = rt_hash_table[st->bucket].chain;
299 }
300 return r;
301}
302
303static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
304{
305 struct rtable *r = rt_cache_get_first(seq);
306
307 if (r)
308 while (pos && (r = rt_cache_get_next(seq, r)))
309 --pos;
310 return pos ? NULL : r;
311}
312
313static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
314{
315 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
316}
317
318static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
319{
320 struct rtable *r = NULL;
321
322 if (v == SEQ_START_TOKEN)
323 r = rt_cache_get_first(seq);
324 else
325 r = rt_cache_get_next(seq, v);
326 ++*pos;
327 return r;
328}
329
330static void rt_cache_seq_stop(struct seq_file *seq, void *v)
331{
332 if (v && v != SEQ_START_TOKEN)
333 rcu_read_unlock_bh();
334}
335
336static int rt_cache_seq_show(struct seq_file *seq, void *v)
337{
338 if (v == SEQ_START_TOKEN)
339 seq_printf(seq, "%-127s\n",
340 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
341 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
342 "HHUptod\tSpecDst");
343 else {
344 struct rtable *r = v;
345 char temp[256];
346
347 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
348 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
349 r->u.dst.dev ? r->u.dst.dev->name : "*",
350 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
351 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
352 r->u.dst.__use, 0, (unsigned long)r->rt_src,
353 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
354 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
355 dst_metric(&r->u.dst, RTAX_WINDOW),
356 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
357 dst_metric(&r->u.dst, RTAX_RTTVAR)),
358 r->fl.fl4_tos,
359 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
360 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
361 dev_queue_xmit) : 0,
362 r->rt_spec_dst);
363 seq_printf(seq, "%-127s\n", temp);
364 }
365 return 0;
366}
367
368static struct seq_operations rt_cache_seq_ops = {
369 .start = rt_cache_seq_start,
370 .next = rt_cache_seq_next,
371 .stop = rt_cache_seq_stop,
372 .show = rt_cache_seq_show,
373};
374
375static int rt_cache_seq_open(struct inode *inode, struct file *file)
376{
377 struct seq_file *seq;
378 int rc = -ENOMEM;
379 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
380
381 if (!s)
382 goto out;
383 rc = seq_open(file, &rt_cache_seq_ops);
384 if (rc)
385 goto out_kfree;
386 seq = file->private_data;
387 seq->private = s;
388 memset(s, 0, sizeof(*s));
389out:
390 return rc;
391out_kfree:
392 kfree(s);
393 goto out;
394}
395
396static struct file_operations rt_cache_seq_fops = {
397 .owner = THIS_MODULE,
398 .open = rt_cache_seq_open,
399 .read = seq_read,
400 .llseek = seq_lseek,
401 .release = seq_release_private,
402};
403
404
405static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
406{
407 int cpu;
408
409 if (*pos == 0)
410 return SEQ_START_TOKEN;
411
412 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
413 if (!cpu_possible(cpu))
414 continue;
415 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800416 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700417 }
418 return NULL;
419}
420
421static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
422{
423 int cpu;
424
425 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
426 if (!cpu_possible(cpu))
427 continue;
428 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800429 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700430 }
431 return NULL;
432
433}
434
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
439
440static int rt_cpu_seq_show(struct seq_file *seq, void *v)
441{
442 struct rt_cache_stat *st = v;
443
444 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700445 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700446 return 0;
447 }
448
449 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
450 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
451 atomic_read(&ipv4_dst_ops.entries),
452 st->in_hit,
453 st->in_slow_tot,
454 st->in_slow_mc,
455 st->in_no_route,
456 st->in_brd,
457 st->in_martian_dst,
458 st->in_martian_src,
459
460 st->out_hit,
461 st->out_slow_tot,
462 st->out_slow_mc,
463
464 st->gc_total,
465 st->gc_ignored,
466 st->gc_goal_miss,
467 st->gc_dst_overflow,
468 st->in_hlist_search,
469 st->out_hlist_search
470 );
471 return 0;
472}
473
474static struct seq_operations rt_cpu_seq_ops = {
475 .start = rt_cpu_seq_start,
476 .next = rt_cpu_seq_next,
477 .stop = rt_cpu_seq_stop,
478 .show = rt_cpu_seq_show,
479};
480
481
482static int rt_cpu_seq_open(struct inode *inode, struct file *file)
483{
484 return seq_open(file, &rt_cpu_seq_ops);
485}
486
487static struct file_operations rt_cpu_seq_fops = {
488 .owner = THIS_MODULE,
489 .open = rt_cpu_seq_open,
490 .read = seq_read,
491 .llseek = seq_lseek,
492 .release = seq_release,
493};
494
495#endif /* CONFIG_PROC_FS */
496
497static __inline__ void rt_free(struct rtable *rt)
498{
499 multipath_remove(rt);
500 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
501}
502
503static __inline__ void rt_drop(struct rtable *rt)
504{
505 multipath_remove(rt);
506 ip_rt_put(rt);
507 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
508}
509
510static __inline__ int rt_fast_clean(struct rtable *rth)
511{
512 /* Kill broadcast/multicast entries very aggresively, if they
513 collide in hash table with more useful entries */
514 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
515 rth->fl.iif && rth->u.rt_next;
516}
517
518static __inline__ int rt_valuable(struct rtable *rth)
519{
520 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
521 rth->u.dst.expires;
522}
523
524static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
525{
526 unsigned long age;
527 int ret = 0;
528
529 if (atomic_read(&rth->u.dst.__refcnt))
530 goto out;
531
532 ret = 1;
533 if (rth->u.dst.expires &&
534 time_after_eq(jiffies, rth->u.dst.expires))
535 goto out;
536
537 age = jiffies - rth->u.dst.lastuse;
538 ret = 0;
539 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
540 (age <= tmo2 && rt_valuable(rth)))
541 goto out;
542 ret = 1;
543out: return ret;
544}
545
546/* Bits of score are:
547 * 31: very valuable
548 * 30: not quite useless
549 * 29..0: usage counter
550 */
551static inline u32 rt_score(struct rtable *rt)
552{
553 u32 score = jiffies - rt->u.dst.lastuse;
554
555 score = ~score & ~(3<<30);
556
557 if (rt_valuable(rt))
558 score |= (1<<31);
559
560 if (!rt->fl.iif ||
561 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
562 score |= (1<<30);
563
564 return score;
565}
566
567static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
568{
Al Viro714e85b2006-11-14 20:51:49 -0800569 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
570 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
Thomas Graf47dcf0c2006-11-09 15:20:38 -0800571 (fl1->mark ^ fl2->mark) |
David S. Miller8238b212006-10-12 00:49:15 -0700572 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
573 *(u16 *)&fl2->nl_u.ip4_u.tos) |
574 (fl1->oif ^ fl2->oif) |
575 (fl1->iif ^ fl2->iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700576}
577
578#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
579static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
580 struct rtable *expentry,
581 int *removed_count)
582{
583 int passedexpired = 0;
584 struct rtable **nextstep = NULL;
585 struct rtable **rthp = chain_head;
586 struct rtable *rth;
587
588 if (removed_count)
589 *removed_count = 0;
590
591 while ((rth = *rthp) != NULL) {
592 if (rth == expentry)
593 passedexpired = 1;
594
595 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
596 compare_keys(&(*rthp)->fl, &expentry->fl)) {
597 if (*rthp == expentry) {
598 *rthp = rth->u.rt_next;
599 continue;
600 } else {
601 *rthp = rth->u.rt_next;
602 rt_free(rth);
603 if (removed_count)
604 ++(*removed_count);
605 }
606 } else {
607 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
608 passedexpired && !nextstep)
609 nextstep = &rth->u.rt_next;
610
611 rthp = &rth->u.rt_next;
612 }
613 }
614
615 rt_free(expentry);
616 if (removed_count)
617 ++(*removed_count);
618
619 return nextstep;
620}
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622
623
624/* This runs via a timer and thus is always in BH context. */
625static void rt_check_expire(unsigned long dummy)
626{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700627 static unsigned int rover;
628 unsigned int i = rover, goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629 struct rtable *rth, **rthp;
630 unsigned long now = jiffies;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700631 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700632
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700633 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
634 if (ip_rt_gc_timeout > 1)
635 do_div(mult, ip_rt_gc_timeout);
636 goal = (unsigned int)mult;
637 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
638 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700639 unsigned long tmo = ip_rt_gc_timeout;
640
641 i = (i + 1) & rt_hash_mask;
642 rthp = &rt_hash_table[i].chain;
643
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700644 if (*rthp == 0)
645 continue;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700646 spin_lock(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700647 while ((rth = *rthp) != NULL) {
648 if (rth->u.dst.expires) {
649 /* Entry is expired even if it is in use */
650 if (time_before_eq(now, rth->u.dst.expires)) {
651 tmo >>= 1;
652 rthp = &rth->u.rt_next;
653 continue;
654 }
655 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
656 tmo >>= 1;
657 rthp = &rth->u.rt_next;
658 continue;
659 }
660
661 /* Cleanup aged off entries. */
662#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
663 /* remove all related balanced entries if necessary */
664 if (rth->u.dst.flags & DST_BALANCED) {
665 rthp = rt_remove_balanced_route(
666 &rt_hash_table[i].chain,
667 rth, NULL);
668 if (!rthp)
669 break;
670 } else {
671 *rthp = rth->u.rt_next;
672 rt_free(rth);
673 }
674#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
675 *rthp = rth->u.rt_next;
676 rt_free(rth);
677#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
678 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700679 spin_unlock(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700680
681 /* Fallback loop breaker. */
682 if (time_after(jiffies, now))
683 break;
684 }
685 rover = i;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700686 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700687}
688
689/* This can run from both BH and non-BH contexts, the latter
690 * in the case of a forced flush event.
691 */
692static void rt_run_flush(unsigned long dummy)
693{
694 int i;
695 struct rtable *rth, *next;
696
697 rt_deadline = 0;
698
699 get_random_bytes(&rt_hash_rnd, 4);
700
701 for (i = rt_hash_mask; i >= 0; i--) {
Eric Dumazet22c047c2005-07-05 14:55:24 -0700702 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703 rth = rt_hash_table[i].chain;
704 if (rth)
705 rt_hash_table[i].chain = NULL;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700706 spin_unlock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700707
708 for (; rth; rth = next) {
709 next = rth->u.rt_next;
710 rt_free(rth);
711 }
712 }
713}
714
715static DEFINE_SPINLOCK(rt_flush_lock);
716
717void rt_cache_flush(int delay)
718{
719 unsigned long now = jiffies;
720 int user_mode = !in_softirq();
721
722 if (delay < 0)
723 delay = ip_rt_min_delay;
724
725 /* flush existing multipath state*/
726 multipath_flush();
727
728 spin_lock_bh(&rt_flush_lock);
729
730 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
731 long tmo = (long)(rt_deadline - now);
732
733 /* If flush timer is already running
734 and flush request is not immediate (delay > 0):
735
736 if deadline is not achieved, prolongate timer to "delay",
737 otherwise fire it at deadline time.
738 */
739
740 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
741 tmo = 0;
742
743 if (delay > tmo)
744 delay = tmo;
745 }
746
747 if (delay <= 0) {
748 spin_unlock_bh(&rt_flush_lock);
749 rt_run_flush(0);
750 return;
751 }
752
753 if (rt_deadline == 0)
754 rt_deadline = now + ip_rt_max_delay;
755
756 mod_timer(&rt_flush_timer, now+delay);
757 spin_unlock_bh(&rt_flush_lock);
758}
759
760static void rt_secret_rebuild(unsigned long dummy)
761{
762 unsigned long now = jiffies;
763
764 rt_cache_flush(0);
765 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
766}
767
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
780
781static int rt_garbage_collect(void)
782{
783 static unsigned long expire = RT_GC_TIMEOUT;
784 static unsigned long last_gc;
785 static int rover;
786 static int equilibrium;
787 struct rtable *rth, **rthp;
788 unsigned long now = jiffies;
789 int goal;
790
791 /*
792 * Garbage collection is pretty expensive,
793 * do not make it too frequently.
794 */
795
796 RT_CACHE_STAT_INC(gc_total);
797
798 if (now - last_gc < ip_rt_gc_min_interval &&
799 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
800 RT_CACHE_STAT_INC(gc_ignored);
801 goto out;
802 }
803
804 /* Calculate number of entries, which we want to expire now. */
805 goal = atomic_read(&ipv4_dst_ops.entries) -
806 (ip_rt_gc_elasticity << rt_hash_log);
807 if (goal <= 0) {
808 if (equilibrium < ipv4_dst_ops.gc_thresh)
809 equilibrium = ipv4_dst_ops.gc_thresh;
810 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
811 if (goal > 0) {
812 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
813 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
814 }
815 } else {
816 /* We are in dangerous area. Try to reduce cache really
817 * aggressively.
818 */
819 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
820 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
821 }
822
823 if (now - last_gc >= ip_rt_gc_min_interval)
824 last_gc = now;
825
826 if (goal <= 0) {
827 equilibrium += goal;
828 goto work_done;
829 }
830
831 do {
832 int i, k;
833
834 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
835 unsigned long tmo = expire;
836
837 k = (k + 1) & rt_hash_mask;
838 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700839 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700840 while ((rth = *rthp) != NULL) {
841 if (!rt_may_expire(rth, tmo, expire)) {
842 tmo >>= 1;
843 rthp = &rth->u.rt_next;
844 continue;
845 }
846#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
847 /* remove all related balanced entries
848 * if necessary
849 */
850 if (rth->u.dst.flags & DST_BALANCED) {
851 int r;
852
853 rthp = rt_remove_balanced_route(
Suresh Bhogavilli85259872006-02-21 13:42:22 -0800854 &rt_hash_table[k].chain,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700855 rth,
856 &r);
857 goal -= r;
858 if (!rthp)
859 break;
860 } else {
861 *rthp = rth->u.rt_next;
862 rt_free(rth);
863 goal--;
864 }
865#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
866 *rthp = rth->u.rt_next;
867 rt_free(rth);
868 goal--;
869#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
870 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700871 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700872 if (goal <= 0)
873 break;
874 }
875 rover = k;
876
877 if (goal <= 0)
878 goto work_done;
879
880 /* Goal is not achieved. We stop process if:
881
882 - if expire reduced to zero. Otherwise, expire is halfed.
883 - if table is not full.
884 - if we are called from interrupt.
885 - jiffies check is just fallback/debug loop breaker.
886 We will not spin here for long time in any case.
887 */
888
889 RT_CACHE_STAT_INC(gc_goal_miss);
890
891 if (expire == 0)
892 break;
893
894 expire >>= 1;
895#if RT_CACHE_DEBUG >= 2
896 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
897 atomic_read(&ipv4_dst_ops.entries), goal, i);
898#endif
899
900 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901 goto out;
902 } while (!in_softirq() && time_before_eq(jiffies, now));
903
904 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
905 goto out;
906 if (net_ratelimit())
907 printk(KERN_WARNING "dst cache overflow\n");
908 RT_CACHE_STAT_INC(gc_dst_overflow);
909 return 1;
910
911work_done:
912 expire += ip_rt_gc_min_interval;
913 if (expire > ip_rt_gc_timeout ||
914 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
915 expire = ip_rt_gc_timeout;
916#if RT_CACHE_DEBUG >= 2
917 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
918 atomic_read(&ipv4_dst_ops.entries), goal, rover);
919#endif
920out: return 0;
921}
922
923static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
924{
925 struct rtable *rth, **rthp;
926 unsigned long now;
927 struct rtable *cand, **candp;
928 u32 min_score;
929 int chain_length;
930 int attempts = !in_softirq();
931
932restart:
933 chain_length = 0;
934 min_score = ~(u32)0;
935 cand = NULL;
936 candp = NULL;
937 now = jiffies;
938
939 rthp = &rt_hash_table[hash].chain;
940
Eric Dumazet22c047c2005-07-05 14:55:24 -0700941 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700942 while ((rth = *rthp) != NULL) {
943#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
944 if (!(rth->u.dst.flags & DST_BALANCED) &&
945 compare_keys(&rth->fl, &rt->fl)) {
946#else
947 if (compare_keys(&rth->fl, &rt->fl)) {
948#endif
949 /* Put it first */
950 *rthp = rth->u.rt_next;
951 /*
952 * Since lookup is lockfree, the deletion
953 * must be visible to another weakly ordered CPU before
954 * the insertion at the start of the hash chain.
955 */
956 rcu_assign_pointer(rth->u.rt_next,
957 rt_hash_table[hash].chain);
958 /*
959 * Since lookup is lockfree, the update writes
960 * must be ordered for consistency on SMP.
961 */
962 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
963
964 rth->u.dst.__use++;
965 dst_hold(&rth->u.dst);
966 rth->u.dst.lastuse = now;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700967 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700968
969 rt_drop(rt);
970 *rp = rth;
971 return 0;
972 }
973
974 if (!atomic_read(&rth->u.dst.__refcnt)) {
975 u32 score = rt_score(rth);
976
977 if (score <= min_score) {
978 cand = rth;
979 candp = rthp;
980 min_score = score;
981 }
982 }
983
984 chain_length++;
985
986 rthp = &rth->u.rt_next;
987 }
988
989 if (cand) {
990 /* ip_rt_gc_elasticity used to be average length of chain
991 * length, when exceeded gc becomes really aggressive.
992 *
993 * The second limit is less certain. At the moment it allows
994 * only 2 entries per bucket. We will see.
995 */
996 if (chain_length > ip_rt_gc_elasticity) {
997 *candp = cand->u.rt_next;
998 rt_free(cand);
999 }
1000 }
1001
1002 /* Try to bind route to arp only if it is output
1003 route or unicast forwarding path.
1004 */
1005 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1006 int err = arp_bind_neighbour(&rt->u.dst);
1007 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001008 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001009
1010 if (err != -ENOBUFS) {
1011 rt_drop(rt);
1012 return err;
1013 }
1014
1015 /* Neighbour tables are full and nothing
1016 can be released. Try to shrink route cache,
1017 it is most likely it holds some neighbour records.
1018 */
1019 if (attempts-- > 0) {
1020 int saved_elasticity = ip_rt_gc_elasticity;
1021 int saved_int = ip_rt_gc_min_interval;
1022 ip_rt_gc_elasticity = 1;
1023 ip_rt_gc_min_interval = 0;
1024 rt_garbage_collect();
1025 ip_rt_gc_min_interval = saved_int;
1026 ip_rt_gc_elasticity = saved_elasticity;
1027 goto restart;
1028 }
1029
1030 if (net_ratelimit())
1031 printk(KERN_WARNING "Neighbour table overflow.\n");
1032 rt_drop(rt);
1033 return -ENOBUFS;
1034 }
1035 }
1036
1037 rt->u.rt_next = rt_hash_table[hash].chain;
1038#if RT_CACHE_DEBUG >= 2
1039 if (rt->u.rt_next) {
1040 struct rtable *trt;
1041 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1042 NIPQUAD(rt->rt_dst));
1043 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1044 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1045 printk("\n");
1046 }
1047#endif
1048 rt_hash_table[hash].chain = rt;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001049 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001050 *rp = rt;
1051 return 0;
1052}
1053
1054void rt_bind_peer(struct rtable *rt, int create)
1055{
1056 static DEFINE_SPINLOCK(rt_peer_lock);
1057 struct inet_peer *peer;
1058
1059 peer = inet_getpeer(rt->rt_dst, create);
1060
1061 spin_lock_bh(&rt_peer_lock);
1062 if (rt->peer == NULL) {
1063 rt->peer = peer;
1064 peer = NULL;
1065 }
1066 spin_unlock_bh(&rt_peer_lock);
1067 if (peer)
1068 inet_putpeer(peer);
1069}
1070
1071/*
1072 * Peer allocation may fail only in serious out-of-memory conditions. However
1073 * we still can generate some output.
1074 * Random ID selection looks a bit dangerous because we have no chances to
1075 * select ID being unique in a reasonable period of time.
1076 * But broken packet identifier may be better than no packet at all.
1077 */
1078static void ip_select_fb_ident(struct iphdr *iph)
1079{
1080 static DEFINE_SPINLOCK(ip_fb_id_lock);
1081 static u32 ip_fallback_id;
1082 u32 salt;
1083
1084 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001085 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001086 iph->id = htons(salt & 0xFFFF);
1087 ip_fallback_id = salt;
1088 spin_unlock_bh(&ip_fb_id_lock);
1089}
1090
1091void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1092{
1093 struct rtable *rt = (struct rtable *) dst;
1094
1095 if (rt) {
1096 if (rt->peer == NULL)
1097 rt_bind_peer(rt, 1);
1098
1099 /* If peer is attached to destination, it is never detached,
1100 so that we need not to grab a lock to dereference it.
1101 */
1102 if (rt->peer) {
1103 iph->id = htons(inet_getid(rt->peer, more));
1104 return;
1105 }
1106 } else
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001107 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1108 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001109
1110 ip_select_fb_ident(iph);
1111}
1112
1113static void rt_del(unsigned hash, struct rtable *rt)
1114{
1115 struct rtable **rthp;
1116
Eric Dumazet22c047c2005-07-05 14:55:24 -07001117 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001118 ip_rt_put(rt);
1119 for (rthp = &rt_hash_table[hash].chain; *rthp;
1120 rthp = &(*rthp)->u.rt_next)
1121 if (*rthp == rt) {
1122 *rthp = rt->u.rt_next;
1123 rt_free(rt);
1124 break;
1125 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001126 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001127}
1128
/*
 * Process a redirect for @daddr: gateway @old_gw, seen on @dev, advises
 * that @new_gw be used instead for traffic sent from @saddr.
 *
 * The advice is rejected (and, with CONFIG_IP_ROUTE_VERBOSE, possibly
 * logged) when new_gw equals old_gw, when the device does not accept
 * redirects, when new_gw is a multicast, bad-class or zero-net address,
 * or when it fails the on-link / unicast checks below.
 *
 * Otherwise the cache chains for daddr are searched under all four
 * combinations of source key (saddr or 0) and output-interface key
 * (dev->ifindex or 0).  In each chain the first entry with matching keys
 * and fl.iif == 0 is examined; if it still routes via old_gw on dev, a
 * copy pointing at new_gw and flagged RTCF_REDIRECTED is built, bound to
 * a neighbour and inserted in place of the old entry.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				/* Skip entries whose lookup keys differ. */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				/*
				 * Keys match, but the entry does not describe
				 * the path being redirected: give up on this
				 * chain.
				 */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/*
				 * Take a reference on rth before leaving the
				 * RCU read-side section; it is used below.
				 */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				/*
				 * Reset the per-entry state that must not be
				 * shared with rth, and take the copy's own
				 * references on the device and idev.
				 */
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				/* The copied peer pointer needs its own reference. */
				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/*
				 * Install the copy only if a neighbour could
				 * be bound and is in a NUD_VALID state.
				 * Otherwise poke the neighbour (if one was
				 * bound) and throw the copy away.
				 */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				/*
				 * rt_del() also drops the reference taken by
				 * dst_hold() above.
				 */
				rt_del(hash, rth);
				/*
				 * On success, release our reference on the
				 * entry rt_intern_hash() handed back via &rt.
				 */
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
			/* Reached with the RCU read lock already released. */
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			" Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}
1258
1259static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1260{
1261 struct rtable *rt = (struct rtable*)dst;
1262 struct dst_entry *ret = dst;
1263
1264 if (rt) {
1265 if (dst->obsolete) {
1266 ip_rt_put(rt);
1267 ret = NULL;
1268 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1269 rt->u.dst.expires) {
Al Viro8c7bc842006-09-26 21:26:19 -07001270 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1271 rt->fl.oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001272#if RT_CACHE_DEBUG >= 1
1273 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1274 "%u.%u.%u.%u/%02x dropped\n",
1275 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1276#endif
1277 rt_del(hash, rt);
1278 ret = NULL;
1279 }
1280 }
1281 return ret;
1282}
1283
1284/*
1285 * Algorithm:
1286 * 1. The first ip_rt_redirect_number redirects are sent
1287 * with exponential backoff, then we stop sending them at all,
1288 * assuming that the host ignores our redirects.
1289 * 2. If we did not see packets requiring redirects
1290 * during ip_rt_redirect_silence, we assume that the host
1291 * forgot redirected route and start to send redirects again.
1292 *
1293 * This algorithm is much cheaper and more intelligent than dumb load limiting
1294 * in icmp.c.
1295 *
1296 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1297 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1298 */
1299
1300void ip_rt_send_redirect(struct sk_buff *skb)
1301{
1302 struct rtable *rt = (struct rtable*)skb->dst;
1303 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1304
1305 if (!in_dev)
1306 return;
1307
1308 if (!IN_DEV_TX_REDIRECTS(in_dev))
1309 goto out;
1310
1311 /* No redirected packets during ip_rt_redirect_silence;
1312 * reset the algorithm.
1313 */
1314 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1315 rt->u.dst.rate_tokens = 0;
1316
1317 /* Too many ignored redirects; do not send anything
1318 * set u.dst.rate_last to the last seen redirected packet.
1319 */
1320 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1321 rt->u.dst.rate_last = jiffies;
1322 goto out;
1323 }
1324
1325 /* Check for load limit; set rate_last to the latest sent
1326 * redirect.
1327 */
1328 if (time_after(jiffies,
1329 (rt->u.dst.rate_last +
1330 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1331 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1332 rt->u.dst.rate_last = jiffies;
1333 ++rt->u.dst.rate_tokens;
1334#ifdef CONFIG_IP_ROUTE_VERBOSE
1335 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1336 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1337 net_ratelimit())
1338 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1339 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1340 NIPQUAD(rt->rt_src), rt->rt_iif,
1341 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1342#endif
1343 }
1344out:
1345 in_dev_put(in_dev);
1346}
1347
1348static int ip_error(struct sk_buff *skb)
1349{
1350 struct rtable *rt = (struct rtable*)skb->dst;
1351 unsigned long now;
1352 int code;
1353
1354 switch (rt->u.dst.error) {
1355 case EINVAL:
1356 default:
1357 goto out;
1358 case EHOSTUNREACH:
1359 code = ICMP_HOST_UNREACH;
1360 break;
1361 case ENETUNREACH:
1362 code = ICMP_NET_UNREACH;
1363 break;
1364 case EACCES:
1365 code = ICMP_PKT_FILTERED;
1366 break;
1367 }
1368
1369 now = jiffies;
1370 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1371 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1372 rt->u.dst.rate_tokens = ip_rt_error_burst;
1373 rt->u.dst.rate_last = now;
1374 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1375 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1376 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1377 }
1378
1379out: kfree_skb(skb);
1380 return 0;
1381}
1382
/*
 * MTU plateau table, in descending order.
 *
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest table entry.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau != end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1400
1401unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1402{
1403 int i;
1404 unsigned short old_mtu = ntohs(iph->tot_len);
1405 struct rtable *rth;
Al Viroe4485152006-09-26 22:15:01 -07001406 __be32 skeys[2] = { iph->saddr, 0, };
1407 __be32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001408 unsigned short est_mtu = 0;
1409
1410 if (ipv4_config.no_pmtu_disc)
1411 return 0;
1412
1413 for (i = 0; i < 2; i++) {
Al Viro8c7bc842006-09-26 21:26:19 -07001414 unsigned hash = rt_hash(daddr, skeys[i], 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415
1416 rcu_read_lock();
1417 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1418 rth = rcu_dereference(rth->u.rt_next)) {
1419 if (rth->fl.fl4_dst == daddr &&
1420 rth->fl.fl4_src == skeys[i] &&
1421 rth->rt_dst == daddr &&
1422 rth->rt_src == iph->saddr &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423 rth->fl.iif == 0 &&
1424 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1425 unsigned short mtu = new_mtu;
1426
1427 if (new_mtu < 68 || new_mtu >= old_mtu) {
1428
1429 /* BSD 4.2 compatibility hack :-( */
1430 if (mtu == 0 &&
1431 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1432 old_mtu >= 68 + (iph->ihl << 2))
1433 old_mtu -= iph->ihl << 2;
1434
1435 mtu = guess_mtu(old_mtu);
1436 }
1437 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1438 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1439 dst_confirm(&rth->u.dst);
1440 if (mtu < ip_rt_min_pmtu) {
1441 mtu = ip_rt_min_pmtu;
1442 rth->u.dst.metrics[RTAX_LOCK-1] |=
1443 (1 << RTAX_MTU);
1444 }
1445 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1446 dst_set_expires(&rth->u.dst,
1447 ip_rt_mtu_expires);
1448 }
1449 est_mtu = mtu;
1450 }
1451 }
1452 }
1453 rcu_read_unlock();
1454 }
1455 return est_mtu ? : new_mtu;
1456}
1457
1458static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1459{
1460 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1461 !(dst_metric_locked(dst, RTAX_MTU))) {
1462 if (mtu < ip_rt_min_pmtu) {
1463 mtu = ip_rt_min_pmtu;
1464 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1465 }
1466 dst->metrics[RTAX_MTU-1] = mtu;
1467 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001468 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001469 }
1470}
1471
/*
 * dst check hook for IPv4 routes.  Both arguments are ignored and NULL is
 * returned unconditionally.
 * NOTE(review): presumably this makes every re-validation request report
 * the entry as unusable so the caller does a fresh lookup - confirm
 * against the dst_ops users.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
1476
1477static void ipv4_dst_destroy(struct dst_entry *dst)
1478{
1479 struct rtable *rt = (struct rtable *) dst;
1480 struct inet_peer *peer = rt->peer;
1481 struct in_device *idev = rt->idev;
1482
1483 if (peer) {
1484 rt->peer = NULL;
1485 inet_putpeer(peer);
1486 }
1487
1488 if (idev) {
1489 rt->idev = NULL;
1490 in_dev_put(idev);
1491 }
1492}
1493
1494static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1495 int how)
1496{
1497 struct rtable *rt = (struct rtable *) dst;
1498 struct in_device *idev = rt->idev;
1499 if (dev != &loopback_dev && idev && idev->dev == dev) {
1500 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1501 if (loopback_idev) {
1502 rt->idev = loopback_idev;
1503 in_dev_put(idev);
1504 }
1505 }
1506}
1507
1508static void ipv4_link_failure(struct sk_buff *skb)
1509{
1510 struct rtable *rt;
1511
1512 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1513
1514 rt = (struct rtable *) skb->dst;
1515 if (rt)
1516 dst_set_expires(&rt->u.dst, 0);
1517}
1518
1519static int ip_rt_bug(struct sk_buff *skb)
1520{
1521 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1522 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1523 skb->dev ? skb->dev->name : "?");
1524 kfree_skb(skb);
1525 return 0;
1526}
1527
1528/*
1529 We do not cache source address of outgoing interface,
1530 because it is used only by IP RR, TS and SRR options,
1531 so that it out of fast path.
1532
1533 BTW remember: "addr" is allowed to be not aligned
1534 in IP options!
1535 */
1536
1537void ip_rt_get_source(u8 *addr, struct rtable *rt)
1538{
Al Viroa61ced52006-09-26 21:27:54 -07001539 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540 struct fib_result res;
1541
1542 if (rt->fl.iif == 0)
1543 src = rt->rt_src;
1544 else if (fib_lookup(&rt->fl, &res) == 0) {
1545 src = FIB_RES_PREFSRC(res);
1546 fib_res_put(&res);
1547 } else
1548 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1549 RT_SCOPE_UNIVERSE);
1550 memcpy(addr, &src, 4);
1551}
1552
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently, and a half is only filled in from
 * @tag while it is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1562
1563static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1564{
1565 struct fib_info *fi = res->fi;
1566
1567 if (fi) {
1568 if (FIB_RES_GW(*res) &&
1569 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1570 rt->rt_gateway = FIB_RES_GW(*res);
1571 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1572 sizeof(rt->u.dst.metrics));
1573 if (fi->fib_mtu == 0) {
1574 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1575 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1576 rt->rt_gateway != rt->rt_dst &&
1577 rt->u.dst.dev->mtu > 576)
1578 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1579 }
1580#ifdef CONFIG_NET_CLS_ROUTE
1581 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1582#endif
1583 } else
1584 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1585
1586 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1587 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1588 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1589 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1590 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1591 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1592 ip_rt_min_advmss);
1593 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1594 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1595
1596#ifdef CONFIG_NET_CLS_ROUTE
1597#ifdef CONFIG_IP_MULTIPLE_TABLES
1598 set_class_tag(rt, fib_rules_tclass(res));
1599#endif
1600 set_class_tag(rt, itag);
1601#endif
1602 rt->rt_type = res->type;
1603}
1604
Al Viro9e12bb22006-09-26 21:25:20 -07001605static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001606 u8 tos, struct net_device *dev, int our)
1607{
1608 unsigned hash;
1609 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001610 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611 struct in_device *in_dev = in_dev_get(dev);
1612 u32 itag = 0;
1613
1614 /* Primary sanity checks. */
1615
1616 if (in_dev == NULL)
1617 return -EINVAL;
1618
1619 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1620 skb->protocol != htons(ETH_P_IP))
1621 goto e_inval;
1622
1623 if (ZERONET(saddr)) {
1624 if (!LOCAL_MCAST(daddr))
1625 goto e_inval;
1626 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1627 } else if (fib_validate_source(saddr, 0, tos, 0,
1628 dev, &spec_dst, &itag) < 0)
1629 goto e_inval;
1630
1631 rth = dst_alloc(&ipv4_dst_ops);
1632 if (!rth)
1633 goto e_nobufs;
1634
1635 rth->u.dst.output= ip_rt_bug;
1636
1637 atomic_set(&rth->u.dst.__refcnt, 1);
1638 rth->u.dst.flags= DST_HOST;
1639 if (in_dev->cnf.no_policy)
1640 rth->u.dst.flags |= DST_NOPOLICY;
1641 rth->fl.fl4_dst = daddr;
1642 rth->rt_dst = daddr;
1643 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001644 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001645 rth->fl.fl4_src = saddr;
1646 rth->rt_src = saddr;
1647#ifdef CONFIG_NET_CLS_ROUTE
1648 rth->u.dst.tclassid = itag;
1649#endif
1650 rth->rt_iif =
1651 rth->fl.iif = dev->ifindex;
1652 rth->u.dst.dev = &loopback_dev;
1653 dev_hold(rth->u.dst.dev);
1654 rth->idev = in_dev_get(rth->u.dst.dev);
1655 rth->fl.oif = 0;
1656 rth->rt_gateway = daddr;
1657 rth->rt_spec_dst= spec_dst;
1658 rth->rt_type = RTN_MULTICAST;
1659 rth->rt_flags = RTCF_MULTICAST;
1660 if (our) {
1661 rth->u.dst.input= ip_local_deliver;
1662 rth->rt_flags |= RTCF_LOCAL;
1663 }
1664
1665#ifdef CONFIG_IP_MROUTE
1666 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1667 rth->u.dst.input = ip_mr_input;
1668#endif
1669 RT_CACHE_STAT_INC(in_slow_mc);
1670
1671 in_dev_put(in_dev);
Al Viro8c7bc842006-09-26 21:26:19 -07001672 hash = rt_hash(daddr, saddr, dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001673 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1674
1675e_nobufs:
1676 in_dev_put(in_dev);
1677 return -ENOBUFS;
1678
1679e_inval:
1680 in_dev_put(in_dev);
1681 return -EINVAL;
1682}
1683
1684
1685static void ip_handle_martian_source(struct net_device *dev,
1686 struct in_device *in_dev,
1687 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001688 __be32 daddr,
1689 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001690{
1691 RT_CACHE_STAT_INC(in_martian_src);
1692#ifdef CONFIG_IP_ROUTE_VERBOSE
1693 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1694 /*
1695 * RFC1812 recommendation, if source is martian,
1696 * the only hint is MAC header.
1697 */
1698 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1699 "%u.%u.%u.%u, on dev %s\n",
1700 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Olaf Kirch0b7f22a2005-07-11 21:01:42 -07001701 if (dev->hard_header_len && skb->mac.raw) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001702 int i;
1703 unsigned char *p = skb->mac.raw;
1704 printk(KERN_WARNING "ll header: ");
1705 for (i = 0; i < dev->hard_header_len; i++, p++) {
1706 printk("%02x", *p);
1707 if (i < (dev->hard_header_len - 1))
1708 printk(":");
1709 }
1710 printk("\n");
1711 }
1712 }
1713#endif
1714}
1715
/*
 * Allocate and initialise (but do not cache) a forwarding route for an
 * input packet, based on FIB result @res.
 *
 * On success 0 is returned and the new entry, holding one reference, is
 * stored in *@result.  Errors:
 *   -EINVAL   no in_device on the output device, source validation
 *             failed (the martian source is accounted/logged), or a
 *             non-IP packet would be routed back out of the device it
 *             came in on;
 *   -ENOBUFS  no route entry could be allocated.
 *
 * The reference taken on the output in_device is always dropped before
 * returning.
 */
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result* res,
				  struct in_device *in_dev,
				  __be32 daddr, __be32 saddr, u32 tos,
				  struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	/* Negative: martian source.  Positive: flagged as RTCF_DIRECTSRC. */
	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	/*
	 * Packet would leave through the interface it arrived on: mark the
	 * route so that a redirect is sent, provided the medium is shared
	 * or the sender is on-link with the chosen gateway.
	 */
	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	if (res->fi->fib_nhs > 1)
		rth->u.dst.flags |= DST_BALANCED;
#endif
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	if (out_dev->cnf.no_xfrm)
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	/* Default gateway is the destination; rt_set_nexthop() may override. */
	rth->rt_gateway	= daddr;
	rth->rt_iif 	=
		rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif 	= 0;
	rth->rt_spec_dst= spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}
1814
1815static inline int ip_mkroute_input_def(struct sk_buff *skb,
1816 struct fib_result* res,
1817 const struct flowi *fl,
1818 struct in_device *in_dev,
Al Viro9e12bb22006-09-26 21:25:20 -07001819 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820{
Chuck Short7abaa272005-06-22 22:10:23 -07001821 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822 int err;
1823 unsigned hash;
1824
1825#ifdef CONFIG_IP_ROUTE_MULTIPATH
1826 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1827 fib_select_multipath(fl, res);
1828#endif
1829
1830 /* create a routing cache entry */
1831 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1832 if (err)
1833 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001834
1835 /* put it into the cache */
Al Viro8c7bc842006-09-26 21:26:19 -07001836 hash = rt_hash(daddr, saddr, fl->iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001837 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1838}
1839
/*
 * Create the input forwarding route(s) for a packet and attach one to
 * skb->dst.
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result has
 * fewer than two next hops, this is just ip_mkroute_input_def().
 * Otherwise one cache entry is created per next hop and the entry
 * returned for the last hop is the one attached to the skb.
 *
 * Returns 0 on success or the first error reported by __mkroute_input()
 * or rt_intern_hash().
 */
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result* res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL, *rtres;
	unsigned char hop, hopcount;
	int err = -EINVAL;
	unsigned int hash;

	if (res->fi)
		hopcount = res->fi->fib_nhs;
	else
		hopcount = 1;

	/* distinguish between multipath and singlepath */
	if (hopcount < 2)
		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
					    saddr, tos);

	/* add all alternatives to the routing cache */
	for (hop = 0; hop < hopcount; hop++) {
		res->nh_sel = hop;

		/*
		 * put reference to previous result; rtres is only read
		 * here from the second iteration on, after the previous
		 * rt_intern_hash() call has set it
		 */
		if (hop)
			ip_rt_put(rtres);

		/* create a routing cache entry */
		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
				      &rth);
		if (err)
			return err;

		/* put it into the cache */
		hash = rt_hash(daddr, saddr, fl->iif);
		err = rt_intern_hash(hash, rth, &rtres);
		if (err)
			return err;

		/*
		 * forward hop information to multipath impl.
		 * NOTE(review): this passes rth, not the rtres returned by
		 * rt_intern_hash() - confirm the two cannot differ here.
		 */
		multipath_set_nhinfo(rth,
				     FIB_RES_NETWORK(*res),
				     FIB_RES_NETMASK(*res),
				     res->prefixlen,
				     &FIB_RES_NH(*res));
	}
	/* the reference on the last hop's entry goes to the skb */
	skb->dst = &rtres->u.dst;
	return err;
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
}
1895
1896
/*
 *	Slow path of input routing: validate the addresses, consult the FIB
 *	and create a route cache entry that is attached to skb->dst.
 *
 *	NOTE. We drop all packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the output
 *	routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	Returns 0 on success, or -EINVAL (no in_device, martian source,
 *	non-IP broadcast), -EHOSTUNREACH (martian destination, or
 *	forwarding disabled on the input device), -ENOBUFS, or any other
 *	error passed up from ip_mkroute_input()/rt_intern_hash().
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;	/* set once res holds a FIB result to put */

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	/* Limited broadcast, or all-zero source and destination. */
	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for
	 * complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		/* err is kept: no_route stores -err in the cached entry. */
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     loopback_dev.ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		/* For local delivery the specific destination is daddr. */
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	/* Forwarding case: build and cache the route(s). */
	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
	if (err == -ENOBUFS)
		goto e_nobufs;
	if (err == -EINVAL)
		goto e_inval;

	/* Common exit: release the in_device and, if held, the FIB result. */
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

	/* Broadcast input: only accepted for IP packets. */
brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

	/*
	 * Create a cache entry bound to the loopback device.  Shared by the
	 * local, broadcast and (via no_route) unreachable cases.
	 */
local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
	if (in_dev->cnf.no_policy)
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark    = skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= &loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->u.dst.input= ip_local_deliver;
	rth->rt_flags 	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		/*
		 * Cache the failure: packets hitting this entry go to
		 * ip_error() with the (positive) fib_lookup() error.
		 */
		rth->u.dst.input= ip_error;
		rth->u.dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif);
	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
	goto done;

	/* FIB lookup failed while forwarding is enabled on the device. */
no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

	/* martian_destination falls through to here. */
e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}
2088
Al Viro9e12bb22006-09-26 21:25:20 -07002089int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002090 u8 tos, struct net_device *dev)
2091{
2092 struct rtable * rth;
2093 unsigned hash;
2094 int iif = dev->ifindex;
2095
2096 tos &= IPTOS_RT_MASK;
Al Viro8c7bc842006-09-26 21:26:19 -07002097 hash = rt_hash(daddr, saddr, iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002098
2099 rcu_read_lock();
2100 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2101 rth = rcu_dereference(rth->u.rt_next)) {
2102 if (rth->fl.fl4_dst == daddr &&
2103 rth->fl.fl4_src == saddr &&
2104 rth->fl.iif == iif &&
2105 rth->fl.oif == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002106 rth->fl.mark == skb->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002107 rth->fl.fl4_tos == tos) {
2108 rth->u.dst.lastuse = jiffies;
2109 dst_hold(&rth->u.dst);
2110 rth->u.dst.__use++;
2111 RT_CACHE_STAT_INC(in_hit);
2112 rcu_read_unlock();
2113 skb->dst = (struct dst_entry*)rth;
2114 return 0;
2115 }
2116 RT_CACHE_STAT_INC(in_hlist_search);
2117 }
2118 rcu_read_unlock();
2119
2120 /* Multicast recognition logic is moved from route cache to here.
2121 The problem was that too many Ethernet cards have broken/missing
2122 hardware multicast filters :-( As result the host on multicasting
2123 network acquires a lot of useless route cache entries, sort of
2124 SDR messages from all the world. Now we try to get rid of them.
2125 Really, provided software IP multicast filter is organized
2126 reasonably (at least, hashed), it does not result in a slowdown
2127 comparing with route cache reject entries.
2128 Note, that multicast routers are not affected, because
2129 route cache entry is created eventually.
2130 */
2131 if (MULTICAST(daddr)) {
2132 struct in_device *in_dev;
2133
2134 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002135 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002136 int our = ip_check_mc(in_dev, daddr, saddr,
2137 skb->nh.iph->protocol);
2138 if (our
2139#ifdef CONFIG_IP_MROUTE
2140 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2141#endif
2142 ) {
2143 rcu_read_unlock();
2144 return ip_route_input_mc(skb, daddr, saddr,
2145 tos, dev, our);
2146 }
2147 }
2148 rcu_read_unlock();
2149 return -EINVAL;
2150 }
2151 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2152}
2153
2154static inline int __mkroute_output(struct rtable **result,
2155 struct fib_result* res,
2156 const struct flowi *fl,
2157 const struct flowi *oldflp,
2158 struct net_device *dev_out,
2159 unsigned flags)
2160{
2161 struct rtable *rth;
2162 struct in_device *in_dev;
2163 u32 tos = RT_FL_TOS(oldflp);
2164 int err = 0;
2165
2166 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2167 return -EINVAL;
2168
Al Viroe4485152006-09-26 22:15:01 -07002169 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002170 res->type = RTN_BROADCAST;
2171 else if (MULTICAST(fl->fl4_dst))
2172 res->type = RTN_MULTICAST;
2173 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2174 return -EINVAL;
2175
2176 if (dev_out->flags & IFF_LOOPBACK)
2177 flags |= RTCF_LOCAL;
2178
2179 /* get work reference to inet device */
2180 in_dev = in_dev_get(dev_out);
2181 if (!in_dev)
2182 return -EINVAL;
2183
2184 if (res->type == RTN_BROADCAST) {
2185 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2186 if (res->fi) {
2187 fib_info_put(res->fi);
2188 res->fi = NULL;
2189 }
2190 } else if (res->type == RTN_MULTICAST) {
2191 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2192 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2193 oldflp->proto))
2194 flags &= ~RTCF_LOCAL;
2195 /* If multicast route do not exist use
2196 default one, but do not gateway in this case.
2197 Yes, it is hack.
2198 */
2199 if (res->fi && res->prefixlen < 4) {
2200 fib_info_put(res->fi);
2201 res->fi = NULL;
2202 }
2203 }
2204
2205
2206 rth = dst_alloc(&ipv4_dst_ops);
2207 if (!rth) {
2208 err = -ENOBUFS;
2209 goto cleanup;
2210 }
2211
Julian Anastasovce723d82005-09-08 13:34:47 -07002212 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002213 rth->u.dst.flags= DST_HOST;
2214#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2215 if (res->fi) {
2216 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2217 if (res->fi->fib_nhs > 1)
2218 rth->u.dst.flags |= DST_BALANCED;
2219 }
2220#endif
2221 if (in_dev->cnf.no_xfrm)
2222 rth->u.dst.flags |= DST_NOXFRM;
2223 if (in_dev->cnf.no_policy)
2224 rth->u.dst.flags |= DST_NOPOLICY;
2225
2226 rth->fl.fl4_dst = oldflp->fl4_dst;
2227 rth->fl.fl4_tos = tos;
2228 rth->fl.fl4_src = oldflp->fl4_src;
2229 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002230 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231 rth->rt_dst = fl->fl4_dst;
2232 rth->rt_src = fl->fl4_src;
2233 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2234 /* get references to the devices that are to be hold by the routing
2235 cache entry */
2236 rth->u.dst.dev = dev_out;
2237 dev_hold(dev_out);
2238 rth->idev = in_dev_get(dev_out);
2239 rth->rt_gateway = fl->fl4_dst;
2240 rth->rt_spec_dst= fl->fl4_src;
2241
2242 rth->u.dst.output=ip_output;
2243
2244 RT_CACHE_STAT_INC(out_slow_tot);
2245
2246 if (flags & RTCF_LOCAL) {
2247 rth->u.dst.input = ip_local_deliver;
2248 rth->rt_spec_dst = fl->fl4_dst;
2249 }
2250 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2251 rth->rt_spec_dst = fl->fl4_src;
2252 if (flags & RTCF_LOCAL &&
2253 !(dev_out->flags & IFF_LOOPBACK)) {
2254 rth->u.dst.output = ip_mc_output;
2255 RT_CACHE_STAT_INC(out_slow_mc);
2256 }
2257#ifdef CONFIG_IP_MROUTE
2258 if (res->type == RTN_MULTICAST) {
2259 if (IN_DEV_MFORWARD(in_dev) &&
2260 !LOCAL_MCAST(oldflp->fl4_dst)) {
2261 rth->u.dst.input = ip_mr_input;
2262 rth->u.dst.output = ip_mc_output;
2263 }
2264 }
2265#endif
2266 }
2267
2268 rt_set_nexthop(rth, res, 0);
2269
2270 rth->rt_flags = flags;
2271
2272 *result = rth;
2273 cleanup:
2274 /* release work reference to inet device */
2275 in_dev_put(in_dev);
2276
2277 return err;
2278}
2279
2280static inline int ip_mkroute_output_def(struct rtable **rp,
2281 struct fib_result* res,
2282 const struct flowi *fl,
2283 const struct flowi *oldflp,
2284 struct net_device *dev_out,
2285 unsigned flags)
2286{
Chuck Short7abaa272005-06-22 22:10:23 -07002287 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2289 unsigned hash;
2290 if (err == 0) {
Al Viro8c7bc842006-09-26 21:26:19 -07002291 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002292 err = rt_intern_hash(hash, rth, rp);
2293 }
2294
2295 return err;
2296}
2297
/*
 * ip_mkroute_output - create the output route cache entry (or entries)
 * for a resolved flow.
 * @rp:      receives the cached route; with cached multipath it ends up
 *           pointing at the entry created for the last hop
 * @res:     FIB lookup result; res->nh_sel is modified while iterating
 *           over the next hops
 * @fl:      resolved flow
 * @oldflp:  flow as requested by the caller (cache key)
 * @dev_out: output device, used only on the single-path route
 * @flags:   initial RTCF_* flags
 *
 * Without CONFIG_IP_ROUTE_MULTIPATH_CACHED, or when the FIB result has at
 * most one next hop, this is ip_mkroute_output_def().  Otherwise one
 * cache entry is created per next hop and each one is announced to the
 * multipath implementation.
 *
 * Returns 0 on success or the first error encountered.
 */
static inline int ip_mkroute_output(struct rtable** rp,
				    struct fib_result* res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	unsigned char hop;
	unsigned hash;
	int err = -EINVAL;
	struct rtable *rth = NULL;

	if (res->fi && res->fi->fib_nhs > 1) {
		unsigned char hopcount = res->fi->fib_nhs;

		for (hop = 0; hop < hopcount; hop++) {
			struct net_device *dev2nexthop;

			res->nh_sel = hop;

			/* hold a work reference to the output device */
			dev2nexthop = FIB_RES_DEV(*res);
			dev_hold(dev2nexthop);

			/* put reference to previous result */
			if (hop)
				ip_rt_put(*rp);

			err = __mkroute_output(&rth, res, fl, oldflp,
					       dev2nexthop, flags);

			if (err != 0)
				goto cleanup;

			hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src,
				       oldflp->oif);
			err = rt_intern_hash(hash, rth, rp);

			/*
			 * Do not announce a hop whose route failed to be
			 * cached.  NOTE(review): rt_intern_hash() is
			 * defined outside this function; it appears to
			 * drop the entry it was given when it fails, in
			 * which case rth must not be used any more -
			 * confirm against its implementation.
			 */
			if (err != 0)
				goto cleanup;

			/* forward hop information to multipath impl. */
			multipath_set_nhinfo(rth,
					     FIB_RES_NETWORK(*res),
					     FIB_RES_NETMASK(*res),
					     res->prefixlen,
					     &FIB_RES_NH(*res));
		cleanup:
			/* release work reference to output device */
			dev_put(dev2nexthop);

			if (err != 0)
				return err;
		}
		return err;
	} else {
		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
					     flags);
	}
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
#endif
}
2359
2360/*
2361 * Major route resolver routine.
2362 */
2363
2364static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2365{
2366 u32 tos = RT_FL_TOS(oldflp);
2367 struct flowi fl = { .nl_u = { .ip4_u =
2368 { .daddr = oldflp->fl4_dst,
2369 .saddr = oldflp->fl4_src,
2370 .tos = tos & IPTOS_RT_MASK,
2371 .scope = ((tos & RTO_ONLINK) ?
2372 RT_SCOPE_LINK :
2373 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002375 .mark = oldflp->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002376 .iif = loopback_dev.ifindex,
2377 .oif = oldflp->oif };
2378 struct fib_result res;
2379 unsigned flags = 0;
2380 struct net_device *dev_out = NULL;
2381 int free_res = 0;
2382 int err;
2383
2384
2385 res.fi = NULL;
2386#ifdef CONFIG_IP_MULTIPLE_TABLES
2387 res.r = NULL;
2388#endif
2389
2390 if (oldflp->fl4_src) {
2391 err = -EINVAL;
2392 if (MULTICAST(oldflp->fl4_src) ||
2393 BADCLASS(oldflp->fl4_src) ||
2394 ZERONET(oldflp->fl4_src))
2395 goto out;
2396
2397 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2398 dev_out = ip_dev_find(oldflp->fl4_src);
2399 if (dev_out == NULL)
2400 goto out;
2401
2402 /* I removed check for oif == dev_out->oif here.
2403 It was wrong for two reasons:
2404 1. ip_dev_find(saddr) can return wrong iface, if saddr is
2405 assigned to multiple interfaces.
2406 2. Moreover, we are allowed to send packets with saddr
2407 of another iface. --ANK
2408 */
2409
2410 if (oldflp->oif == 0
Al Viroe4485152006-09-26 22:15:01 -07002411 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002412 /* Special hack: user can direct multicasts
2413 and limited broadcast via necessary interface
2414 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2415 This hack is not just for fun, it allows
2416 vic,vat and friends to work.
2417 They bind socket to loopback, set ttl to zero
2418 and expect that it will work.
2419 From the viewpoint of routing cache they are broken,
2420 because we are not allowed to build multicast path
2421 with loopback source addr (look, routing cache
2422 cannot know, that ttl is zero, so that packet
2423 will not leave this host and route is valid).
2424 Luckily, this hack is good workaround.
2425 */
2426
2427 fl.oif = dev_out->ifindex;
2428 goto make_route;
2429 }
2430 if (dev_out)
2431 dev_put(dev_out);
2432 dev_out = NULL;
2433 }
2434
2435
2436 if (oldflp->oif) {
2437 dev_out = dev_get_by_index(oldflp->oif);
2438 err = -ENODEV;
2439 if (dev_out == NULL)
2440 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002441
2442 /* RACE: Check return value of inet_select_addr instead. */
2443 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002444 dev_put(dev_out);
2445 goto out; /* Wrong error code */
2446 }
2447
Al Viroe4485152006-09-26 22:15:01 -07002448 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002449 if (!fl.fl4_src)
2450 fl.fl4_src = inet_select_addr(dev_out, 0,
2451 RT_SCOPE_LINK);
2452 goto make_route;
2453 }
2454 if (!fl.fl4_src) {
2455 if (MULTICAST(oldflp->fl4_dst))
2456 fl.fl4_src = inet_select_addr(dev_out, 0,
2457 fl.fl4_scope);
2458 else if (!oldflp->fl4_dst)
2459 fl.fl4_src = inet_select_addr(dev_out, 0,
2460 RT_SCOPE_HOST);
2461 }
2462 }
2463
2464 if (!fl.fl4_dst) {
2465 fl.fl4_dst = fl.fl4_src;
2466 if (!fl.fl4_dst)
2467 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2468 if (dev_out)
2469 dev_put(dev_out);
2470 dev_out = &loopback_dev;
2471 dev_hold(dev_out);
2472 fl.oif = loopback_dev.ifindex;
2473 res.type = RTN_LOCAL;
2474 flags |= RTCF_LOCAL;
2475 goto make_route;
2476 }
2477
2478 if (fib_lookup(&fl, &res)) {
2479 res.fi = NULL;
2480 if (oldflp->oif) {
2481 /* Apparently, routing tables are wrong. Assume,
2482 that the destination is on link.
2483
2484 WHY? DW.
2485 Because we are allowed to send to iface
2486 even if it has NO routes and NO assigned
2487 addresses. When oif is specified, routing
2488 tables are looked up with only one purpose:
2489 to catch if destination is gatewayed, rather than
2490 direct. Moreover, if MSG_DONTROUTE is set,
2491 we send packet, ignoring both routing tables
2492 and ifaddr state. --ANK
2493
2494
2495 We could make it even if oif is unknown,
2496 likely IPv6, but we do not.
2497 */
2498
2499 if (fl.fl4_src == 0)
2500 fl.fl4_src = inet_select_addr(dev_out, 0,
2501 RT_SCOPE_LINK);
2502 res.type = RTN_UNICAST;
2503 goto make_route;
2504 }
2505 if (dev_out)
2506 dev_put(dev_out);
2507 err = -ENETUNREACH;
2508 goto out;
2509 }
2510 free_res = 1;
2511
2512 if (res.type == RTN_LOCAL) {
2513 if (!fl.fl4_src)
2514 fl.fl4_src = fl.fl4_dst;
2515 if (dev_out)
2516 dev_put(dev_out);
2517 dev_out = &loopback_dev;
2518 dev_hold(dev_out);
2519 fl.oif = dev_out->ifindex;
2520 if (res.fi)
2521 fib_info_put(res.fi);
2522 res.fi = NULL;
2523 flags |= RTCF_LOCAL;
2524 goto make_route;
2525 }
2526
2527#ifdef CONFIG_IP_ROUTE_MULTIPATH
2528 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2529 fib_select_multipath(&fl, &res);
2530 else
2531#endif
2532 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2533 fib_select_default(&fl, &res);
2534
2535 if (!fl.fl4_src)
2536 fl.fl4_src = FIB_RES_PREFSRC(res);
2537
2538 if (dev_out)
2539 dev_put(dev_out);
2540 dev_out = FIB_RES_DEV(res);
2541 dev_hold(dev_out);
2542 fl.oif = dev_out->ifindex;
2543
2544
2545make_route:
2546 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2547
2548
2549 if (free_res)
2550 fib_res_put(&res);
2551 if (dev_out)
2552 dev_put(dev_out);
2553out: return err;
2554}
2555
2556int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2557{
2558 unsigned hash;
2559 struct rtable *rth;
2560
Al Viro8c7bc842006-09-26 21:26:19 -07002561 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002562
2563 rcu_read_lock_bh();
2564 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2565 rth = rcu_dereference(rth->u.rt_next)) {
2566 if (rth->fl.fl4_dst == flp->fl4_dst &&
2567 rth->fl.fl4_src == flp->fl4_src &&
2568 rth->fl.iif == 0 &&
2569 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002570 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002571 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2572 (IPTOS_RT_MASK | RTO_ONLINK))) {
2573
2574 /* check for multipath routes and choose one if
2575 * necessary
2576 */
2577 if (multipath_select_route(flp, rth, rp)) {
2578 dst_hold(&(*rp)->u.dst);
2579 RT_CACHE_STAT_INC(out_hit);
2580 rcu_read_unlock_bh();
2581 return 0;
2582 }
2583
2584 rth->u.dst.lastuse = jiffies;
2585 dst_hold(&rth->u.dst);
2586 rth->u.dst.__use++;
2587 RT_CACHE_STAT_INC(out_hit);
2588 rcu_read_unlock_bh();
2589 *rp = rth;
2590 return 0;
2591 }
2592 RT_CACHE_STAT_INC(out_hlist_search);
2593 }
2594 rcu_read_unlock_bh();
2595
2596 return ip_route_output_slow(rp, flp);
2597}
2598
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002599EXPORT_SYMBOL_GPL(__ip_route_output_key);
2600
Linus Torvalds1da177e2005-04-16 15:20:36 -07002601int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2602{
2603 int err;
2604
2605 if ((err = __ip_route_output_key(rp, flp)) != 0)
2606 return err;
2607
2608 if (flp->proto) {
2609 if (!flp->fl4_src)
2610 flp->fl4_src = (*rp)->rt_src;
2611 if (!flp->fl4_dst)
2612 flp->fl4_dst = (*rp)->rt_dst;
2613 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2614 }
2615
2616 return 0;
2617}
2618
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002619EXPORT_SYMBOL_GPL(ip_route_output_flow);
2620
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622{
2623 return ip_route_output_flow(rp, flp, NULL, 0);
2624}
2625
2626static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002627 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002628{
2629 struct rtable *rt = (struct rtable*)skb->dst;
2630 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002631 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002632 long expires;
2633 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002634
2635 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2636 if (nlh == NULL)
2637 return -ENOBUFS;
2638
2639 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002640 r->rtm_family = AF_INET;
2641 r->rtm_dst_len = 32;
2642 r->rtm_src_len = 0;
2643 r->rtm_tos = rt->fl.fl4_tos;
2644 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002645 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002646 r->rtm_type = rt->rt_type;
2647 r->rtm_scope = RT_SCOPE_UNIVERSE;
2648 r->rtm_protocol = RTPROT_UNSPEC;
2649 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2650 if (rt->rt_flags & RTCF_NOTIFY)
2651 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002652
Al Viro17fb2c62006-09-26 22:15:25 -07002653 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002654
Linus Torvalds1da177e2005-04-16 15:20:36 -07002655 if (rt->fl.fl4_src) {
2656 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002657 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002658 }
2659 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002660 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002661#ifdef CONFIG_NET_CLS_ROUTE
2662 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002663 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002664#endif
2665#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
Thomas Grafbe403ea2006-08-17 18:15:17 -07002666 if (rt->rt_multipath_alg != IP_MP_ALG_NONE)
2667 NLA_PUT_U32(skb, RTA_MP_ALGO, rt->rt_multipath_alg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002668#endif
2669 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002670 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002671 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002672 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002673
Linus Torvalds1da177e2005-04-16 15:20:36 -07002674 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002675 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002676
Linus Torvalds1da177e2005-04-16 15:20:36 -07002677 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002678 goto nla_put_failure;
2679
Thomas Grafe3703b32006-11-27 09:27:07 -08002680 error = rt->u.dst.error;
2681 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002682 if (rt->peer) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002683 id = rt->peer->ip_id_count;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002684 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002685 ts = rt->peer->tcp_ts;
2686 tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002687 }
2688 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002689
Linus Torvalds1da177e2005-04-16 15:20:36 -07002690 if (rt->fl.iif) {
2691#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002692 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002693
2694 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2695 ipv4_devconf.mc_forwarding) {
2696 int err = ipmr_get_route(skb, r, nowait);
2697 if (err <= 0) {
2698 if (!nowait) {
2699 if (err == 0)
2700 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002701 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002702 } else {
2703 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002704 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002705 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002706 }
2707 }
2708 } else
2709#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002710 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002711 }
2712
Thomas Grafe3703b32006-11-27 09:27:07 -08002713 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2714 expires, error) < 0)
2715 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002716
Thomas Grafbe403ea2006-08-17 18:15:17 -07002717 return nlmsg_end(skb, nlh);
2718
2719nla_put_failure:
2720 return nlmsg_cancel(skb, nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002721}
2722
2723int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2724{
Thomas Grafd889ce32006-08-17 18:15:44 -07002725 struct rtmsg *rtm;
2726 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002727 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002728 __be32 dst = 0;
2729 __be32 src = 0;
2730 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002731 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002732 struct sk_buff *skb;
2733
Thomas Grafd889ce32006-08-17 18:15:44 -07002734 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2735 if (err < 0)
2736 goto errout;
2737
2738 rtm = nlmsg_data(nlh);
2739
Linus Torvalds1da177e2005-04-16 15:20:36 -07002740 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002741 if (skb == NULL) {
2742 err = -ENOBUFS;
2743 goto errout;
2744 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002745
2746 /* Reserve room for dummy headers, this skb can pass
2747 through good chunk of routing engine.
2748 */
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002749 skb->mac.raw = skb->nh.raw = skb->data;
2750
2751 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2752 skb->nh.iph->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002753 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2754
Al Viro17fb2c62006-09-26 22:15:25 -07002755 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2756 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002757 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002758
2759 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002760 struct net_device *dev;
2761
2762 dev = __dev_get_by_index(iif);
2763 if (dev == NULL) {
2764 err = -ENODEV;
2765 goto errout_free;
2766 }
2767
Linus Torvalds1da177e2005-04-16 15:20:36 -07002768 skb->protocol = htons(ETH_P_IP);
2769 skb->dev = dev;
2770 local_bh_disable();
2771 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2772 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002773
2774 rt = (struct rtable*) skb->dst;
2775 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002776 err = -rt->u.dst.error;
2777 } else {
Thomas Grafd889ce32006-08-17 18:15:44 -07002778 struct flowi fl = {
2779 .nl_u = {
2780 .ip4_u = {
2781 .daddr = dst,
2782 .saddr = src,
2783 .tos = rtm->rtm_tos,
2784 },
2785 },
2786 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2787 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788 err = ip_route_output_key(&rt, &fl);
2789 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002790
Linus Torvalds1da177e2005-04-16 15:20:36 -07002791 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002792 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002793
2794 skb->dst = &rt->u.dst;
2795 if (rtm->rtm_flags & RTM_F_NOTIFY)
2796 rt->rt_flags |= RTCF_NOTIFY;
2797
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002799 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002800 if (err <= 0)
2801 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002802
Thomas Graf2942e902006-08-15 00:30:25 -07002803 err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002804errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002805 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806
Thomas Grafd889ce32006-08-17 18:15:44 -07002807errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002808 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002809 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810}
2811
2812int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2813{
2814 struct rtable *rt;
2815 int h, s_h;
2816 int idx, s_idx;
2817
2818 s_h = cb->args[0];
2819 s_idx = idx = cb->args[1];
2820 for (h = 0; h <= rt_hash_mask; h++) {
2821 if (h < s_h) continue;
2822 if (h > s_h)
2823 s_idx = 0;
2824 rcu_read_lock_bh();
2825 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2826 rt = rcu_dereference(rt->u.rt_next), idx++) {
2827 if (idx < s_idx)
2828 continue;
2829 skb->dst = dst_clone(&rt->u.dst);
2830 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002831 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2832 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002833 dst_release(xchg(&skb->dst, NULL));
2834 rcu_read_unlock_bh();
2835 goto done;
2836 }
2837 dst_release(xchg(&skb->dst, NULL));
2838 }
2839 rcu_read_unlock_bh();
2840 }
2841
2842done:
2843 cb->args[0] = h;
2844 cb->args[1] = idx;
2845 return skb->len;
2846}
2847
/*
 * ip_rt_multicast_event - react to a multicast change on a device.
 *
 * The whole route cache is flushed with a delay argument of 0; @in_dev is
 * not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2852
2853#ifdef CONFIG_SYSCTL
2854static int flush_delay;
2855
2856static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2857 struct file *filp, void __user *buffer,
2858 size_t *lenp, loff_t *ppos)
2859{
2860 if (write) {
2861 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2862 rt_cache_flush(flush_delay);
2863 return 0;
2864 }
2865
2866 return -EINVAL;
2867}
2868
2869static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2870 int __user *name,
2871 int nlen,
2872 void __user *oldval,
2873 size_t __user *oldlenp,
2874 void __user *newval,
2875 size_t newlen,
2876 void **context)
2877{
2878 int delay;
2879 if (newlen != sizeof(int))
2880 return -EINVAL;
2881 if (get_user(delay, (int __user *)newval))
2882 return -EFAULT;
2883 rt_cache_flush(delay);
2884 return 0;
2885}
2886
2887ctl_table ipv4_route_table[] = {
2888 {
2889 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2890 .procname = "flush",
2891 .data = &flush_delay,
2892 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002893 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002894 .proc_handler = &ipv4_sysctl_rtcache_flush,
2895 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2896 },
2897 {
2898 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2899 .procname = "min_delay",
2900 .data = &ip_rt_min_delay,
2901 .maxlen = sizeof(int),
2902 .mode = 0644,
2903 .proc_handler = &proc_dointvec_jiffies,
2904 .strategy = &sysctl_jiffies,
2905 },
2906 {
2907 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2908 .procname = "max_delay",
2909 .data = &ip_rt_max_delay,
2910 .maxlen = sizeof(int),
2911 .mode = 0644,
2912 .proc_handler = &proc_dointvec_jiffies,
2913 .strategy = &sysctl_jiffies,
2914 },
2915 {
2916 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2917 .procname = "gc_thresh",
2918 .data = &ipv4_dst_ops.gc_thresh,
2919 .maxlen = sizeof(int),
2920 .mode = 0644,
2921 .proc_handler = &proc_dointvec,
2922 },
2923 {
2924 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2925 .procname = "max_size",
2926 .data = &ip_rt_max_size,
2927 .maxlen = sizeof(int),
2928 .mode = 0644,
2929 .proc_handler = &proc_dointvec,
2930 },
2931 {
2932 /* Deprecated. Use gc_min_interval_ms */
2933
2934 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2935 .procname = "gc_min_interval",
2936 .data = &ip_rt_gc_min_interval,
2937 .maxlen = sizeof(int),
2938 .mode = 0644,
2939 .proc_handler = &proc_dointvec_jiffies,
2940 .strategy = &sysctl_jiffies,
2941 },
2942 {
2943 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2944 .procname = "gc_min_interval_ms",
2945 .data = &ip_rt_gc_min_interval,
2946 .maxlen = sizeof(int),
2947 .mode = 0644,
2948 .proc_handler = &proc_dointvec_ms_jiffies,
2949 .strategy = &sysctl_ms_jiffies,
2950 },
2951 {
2952 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2953 .procname = "gc_timeout",
2954 .data = &ip_rt_gc_timeout,
2955 .maxlen = sizeof(int),
2956 .mode = 0644,
2957 .proc_handler = &proc_dointvec_jiffies,
2958 .strategy = &sysctl_jiffies,
2959 },
2960 {
2961 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2962 .procname = "gc_interval",
2963 .data = &ip_rt_gc_interval,
2964 .maxlen = sizeof(int),
2965 .mode = 0644,
2966 .proc_handler = &proc_dointvec_jiffies,
2967 .strategy = &sysctl_jiffies,
2968 },
2969 {
2970 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2971 .procname = "redirect_load",
2972 .data = &ip_rt_redirect_load,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec,
2976 },
2977 {
2978 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2979 .procname = "redirect_number",
2980 .data = &ip_rt_redirect_number,
2981 .maxlen = sizeof(int),
2982 .mode = 0644,
2983 .proc_handler = &proc_dointvec,
2984 },
2985 {
2986 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2987 .procname = "redirect_silence",
2988 .data = &ip_rt_redirect_silence,
2989 .maxlen = sizeof(int),
2990 .mode = 0644,
2991 .proc_handler = &proc_dointvec,
2992 },
2993 {
2994 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2995 .procname = "error_cost",
2996 .data = &ip_rt_error_cost,
2997 .maxlen = sizeof(int),
2998 .mode = 0644,
2999 .proc_handler = &proc_dointvec,
3000 },
3001 {
3002 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3003 .procname = "error_burst",
3004 .data = &ip_rt_error_burst,
3005 .maxlen = sizeof(int),
3006 .mode = 0644,
3007 .proc_handler = &proc_dointvec,
3008 },
3009 {
3010 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3011 .procname = "gc_elasticity",
3012 .data = &ip_rt_gc_elasticity,
3013 .maxlen = sizeof(int),
3014 .mode = 0644,
3015 .proc_handler = &proc_dointvec,
3016 },
3017 {
3018 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3019 .procname = "mtu_expires",
3020 .data = &ip_rt_mtu_expires,
3021 .maxlen = sizeof(int),
3022 .mode = 0644,
3023 .proc_handler = &proc_dointvec_jiffies,
3024 .strategy = &sysctl_jiffies,
3025 },
3026 {
3027 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3028 .procname = "min_pmtu",
3029 .data = &ip_rt_min_pmtu,
3030 .maxlen = sizeof(int),
3031 .mode = 0644,
3032 .proc_handler = &proc_dointvec,
3033 },
3034 {
3035 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3036 .procname = "min_adv_mss",
3037 .data = &ip_rt_min_advmss,
3038 .maxlen = sizeof(int),
3039 .mode = 0644,
3040 .proc_handler = &proc_dointvec,
3041 },
3042 {
3043 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3044 .procname = "secret_interval",
3045 .data = &ip_rt_secret_interval,
3046 .maxlen = sizeof(int),
3047 .mode = 0644,
3048 .proc_handler = &proc_dointvec_jiffies,
3049 .strategy = &sysctl_jiffies,
3050 },
3051 { .ctl_name = 0 }
3052};
3053#endif
3054
3055#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct;

/* This code sucks.  But you should have seen it before! --RR */

/*
 * IP route accounting ptr for this logical cpu number.
 *
 * Each CPU's region begins 256 entries after the previous CPU's region.
 * The argument is parenthesised so that a compound expression such as
 * "cpu + 1" is scaled as a whole rather than only its last operand.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
3062
3063#ifdef CONFIG_PROC_FS
3064static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3065 int length, int *eof, void *data)
3066{
3067 unsigned int i;
3068
3069 if ((offset & 3) || (length & 3))
3070 return -EIO;
3071
3072 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3073 *eof = 1;
3074 return 0;
3075 }
3076
3077 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3078 length = sizeof(struct ip_rt_acct) * 256 - offset;
3079 *eof = 1;
3080 }
3081
3082 offset /= sizeof(u32);
3083
3084 if (length > 0) {
3085 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3086 u32 *dst = (u32 *) buffer;
3087
3088 /* Copy first cpu. */
3089 *start = buffer;
3090 memcpy(dst, src, length);
3091
3092 /* Add the other cpus in, one int at a time */
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07003093 for_each_possible_cpu(i) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003094 unsigned int j;
3095
3096 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3097
3098 for (j = 0; j < length/4; j++)
3099 dst[j] += src[j];
3100 }
3101 }
3102 return length;
3103}
3104#endif /* CONFIG_PROC_FS */
3105#endif /* CONFIG_NET_CLS_ROUTE */
3106
3107static __initdata unsigned long rhash_entries;
3108static int __init set_rhash_entries(char *str)
3109{
3110 if (!str)
3111 return 0;
3112 rhash_entries = simple_strtoul(str, &str, 0);
3113 return 1;
3114}
3115__setup("rhash_entries=", set_rhash_entries);
3116
3117int __init ip_rt_init(void)
3118{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003119 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003120
3121 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3122 (jiffies ^ (jiffies >> 7)));
3123
3124#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet424c4b72005-07-05 14:58:19 -07003125 {
3126 int order;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003127 for (order = 0;
3128 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3129 /* NOTHING */;
3130 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3131 if (!ip_rt_acct)
3132 panic("IP: failed to allocate ip_rt_acct\n");
3133 memset(ip_rt_acct, 0, PAGE_SIZE << order);
Eric Dumazet424c4b72005-07-05 14:58:19 -07003134 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003135#endif
3136
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003137 ipv4_dst_ops.kmem_cachep =
3138 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3139 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003140
Eric Dumazet424c4b72005-07-05 14:58:19 -07003141 rt_hash_table = (struct rt_hash_bucket *)
3142 alloc_large_system_hash("IP route cache",
3143 sizeof(struct rt_hash_bucket),
3144 rhash_entries,
3145 (num_physpages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003146 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003147 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003148 &rt_hash_log,
3149 &rt_hash_mask,
3150 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003153
3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156
Linus Torvalds1da177e2005-04-16 15:20:36 -07003157 devinet_init();
3158 ip_fib_init();
3159
3160 init_timer(&rt_flush_timer);
3161 rt_flush_timer.function = rt_run_flush;
3162 init_timer(&rt_periodic_timer);
3163 rt_periodic_timer.function = rt_check_expire;
3164 init_timer(&rt_secret_timer);
3165 rt_secret_timer.function = rt_secret_rebuild;
3166
3167 /* All the timers, started at system startup tend
3168 to synchronize. Perturb it a bit.
3169 */
3170 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3171 ip_rt_gc_interval;
3172 add_timer(&rt_periodic_timer);
3173
3174 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3175 ip_rt_secret_interval;
3176 add_timer(&rt_secret_timer);
3177
3178#ifdef CONFIG_PROC_FS
3179 {
3180 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3181 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3182 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3183 proc_net_stat))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003184 return -ENOMEM;
3185 }
3186 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3187 }
3188#ifdef CONFIG_NET_CLS_ROUTE
3189 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3190#endif
3191#endif
3192#ifdef CONFIG_XFRM
3193 xfrm_init();
3194 xfrm4_init();
3195#endif
3196 return rc;
3197}
3198
3199EXPORT_SYMBOL(__ip_select_ident);
3200EXPORT_SYMBOL(ip_route_input);
3201EXPORT_SYMBOL(ip_route_output_key);