/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/ip_mp_alg.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

110#define RT_FL_TOS(oldflp) \
111 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
112
113#define IP_MAX_MTU 0xFFF0
114
115#define RT_GC_TIMEOUT (300*HZ)
116
117static int ip_rt_min_delay = 2 * HZ;
118static int ip_rt_max_delay = 10 * HZ;
119static int ip_rt_max_size;
120static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
121static int ip_rt_gc_interval = 60 * HZ;
122static int ip_rt_gc_min_interval = HZ / 2;
123static int ip_rt_redirect_number = 9;
124static int ip_rt_redirect_load = HZ / 50;
125static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
126static int ip_rt_error_cost = HZ;
127static int ip_rt_error_burst = 5 * HZ;
128static int ip_rt_gc_elasticity = 8;
129static int ip_rt_mtu_expires = 10 * 60 * HZ;
130static int ip_rt_min_pmtu = 512 + 20 + 20;
131static int ip_rt_min_advmss = 256;
132static int ip_rt_secret_interval = 10 * 60 * HZ;
133static unsigned long rt_deadline;
134
135#define RTprint(a...) printk(KERN_DEBUG a)
136
137static struct timer_list rt_flush_timer;
138static struct timer_list rt_periodic_timer;
139static struct timer_list rt_secret_timer;
140
141/*
142 * Interface to generic destination cache.
143 */
144
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146static void ipv4_dst_destroy(struct dst_entry *dst);
147static void ipv4_dst_ifdown(struct dst_entry *dst,
148 struct net_device *dev, int how);
149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152static int rt_garbage_collect(void);
153
154
155static struct dst_ops ipv4_dst_ops = {
156 .family = AF_INET,
157 .protocol = __constant_htons(ETH_P_IP),
158 .gc = rt_garbage_collect,
159 .check = ipv4_dst_check,
160 .destroy = ipv4_dst_destroy,
161 .ifdown = ipv4_dst_ifdown,
162 .negative_advice = ipv4_negative_advice,
163 .link_failure = ipv4_link_failure,
164 .update_pmtu = ip_rt_update_pmtu,
165 .entry_size = sizeof(struct rtable),
166};
167
168#define ECN_OR_COST(class) TC_PRIO_##class
169
170__u8 ip_tos2prio[16] = {
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(FILLER),
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
187};
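/*
 * A reading sketch, not authoritative: this table appears to be indexed by
 * the four legacy TOS bits (via the rt_tos2priority() helper in
 * include/net/route.h, not shown here), so each plain class is paired with
 * its ECN_OR_COST() twin for the odd index where the old "minimize monetary
 * cost" bit is set; e.g. a "minimize delay" TOS lands in TC_PRIO_INTERACTIVE
 * while "maximize throughput" lands in TC_PRIO_BULK.
 */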
188
189
190/*
191 * Route cache.
192 */
193
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
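/* In practice (a sketch of the pattern used throughout this file): lookups
 * walk a chain under rcu_read_lock()/rcu_dereference() and only ever take a
 * reference with dst_hold(), while rt_intern_hash(), rt_del() and the timers
 * take spin_lock_bh(rt_hash_lock_addr(hash)) before unlinking entries, which
 * are then freed via rt_free() -> call_rcu_bh().
 */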
203
204struct rt_hash_bucket {
205 struct rtable *chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks. The size of this table is a power of two and depends on the
 * number of CPUs.
 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif

static struct rt_hash_bucket *rt_hash_table;
239static unsigned rt_hash_mask;
240static int rt_hash_log;
241static unsigned int rt_hash_rnd;
242
243struct rt_cache_stat *rt_cache_stat;
244
245static int rt_intern_hash(unsigned hash, struct rtable *rth,
246 struct rtable **res);
247
248static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
249{
250 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
251 & rt_hash_mask);
252}
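/* Example (mirroring the callers below, not an addition to the API): input
 * routes fold the ingress ifindex into the source key,
 *
 *	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
 *
 * and rt_hash_rnd is re-randomized by rt_secret_rebuild(), so bucket
 * placement is not predictable across secret intervals.
 */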
253
254#ifdef CONFIG_PROC_FS
255struct rt_cache_iter_state {
256 int bucket;
257};
258
259static struct rtable *rt_cache_get_first(struct seq_file *seq)
260{
261 struct rtable *r = NULL;
262 struct rt_cache_iter_state *st = seq->private;
263
264 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
265 rcu_read_lock_bh();
266 r = rt_hash_table[st->bucket].chain;
267 if (r)
268 break;
269 rcu_read_unlock_bh();
270 }
271 return r;
272}
273
274static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
275{
276 struct rt_cache_iter_state *st = rcu_dereference(seq->private);
277
278 r = r->u.rt_next;
279 while (!r) {
280 rcu_read_unlock_bh();
281 if (--st->bucket < 0)
282 break;
283 rcu_read_lock_bh();
284 r = rt_hash_table[st->bucket].chain;
285 }
286 return r;
287}
288
289static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
290{
291 struct rtable *r = rt_cache_get_first(seq);
292
293 if (r)
294 while (pos && (r = rt_cache_get_next(seq, r)))
295 --pos;
296 return pos ? NULL : r;
297}
298
299static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
300{
301 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
302}
303
304static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
305{
306 struct rtable *r = NULL;
307
308 if (v == SEQ_START_TOKEN)
309 r = rt_cache_get_first(seq);
310 else
311 r = rt_cache_get_next(seq, v);
312 ++*pos;
313 return r;
314}
315
316static void rt_cache_seq_stop(struct seq_file *seq, void *v)
317{
318 if (v && v != SEQ_START_TOKEN)
319 rcu_read_unlock_bh();
320}
321
322static int rt_cache_seq_show(struct seq_file *seq, void *v)
323{
324 if (v == SEQ_START_TOKEN)
325 seq_printf(seq, "%-127s\n",
326 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
327 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
328 "HHUptod\tSpecDst");
329 else {
330 struct rtable *r = v;
331 char temp[256];
332
333 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
334 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
335 r->u.dst.dev ? r->u.dst.dev->name : "*",
336 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
337 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
338 r->u.dst.__use, 0, (unsigned long)r->rt_src,
339 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
340 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
341 dst_metric(&r->u.dst, RTAX_WINDOW),
342 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
343 dst_metric(&r->u.dst, RTAX_RTTVAR)),
344 r->fl.fl4_tos,
345 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
346 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
347 dev_queue_xmit) : 0,
348 r->rt_spec_dst);
349 seq_printf(seq, "%-127s\n", temp);
350 }
351 return 0;
352}
353
354static struct seq_operations rt_cache_seq_ops = {
355 .start = rt_cache_seq_start,
356 .next = rt_cache_seq_next,
357 .stop = rt_cache_seq_stop,
358 .show = rt_cache_seq_show,
359};
360
361static int rt_cache_seq_open(struct inode *inode, struct file *file)
362{
363 struct seq_file *seq;
364 int rc = -ENOMEM;
365 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
366
367 if (!s)
368 goto out;
369 rc = seq_open(file, &rt_cache_seq_ops);
370 if (rc)
371 goto out_kfree;
372 seq = file->private_data;
373 seq->private = s;
374 memset(s, 0, sizeof(*s));
375out:
376 return rc;
377out_kfree:
378 kfree(s);
379 goto out;
380}
381
382static struct file_operations rt_cache_seq_fops = {
383 .owner = THIS_MODULE,
384 .open = rt_cache_seq_open,
385 .read = seq_read,
386 .llseek = seq_lseek,
387 .release = seq_release_private,
388};
389
390
391static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
392{
393 int cpu;
394
395 if (*pos == 0)
396 return SEQ_START_TOKEN;
397
398 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399 if (!cpu_possible(cpu))
400 continue;
401 *pos = cpu+1;
402 return per_cpu_ptr(rt_cache_stat, cpu);
403 }
404 return NULL;
405}
406
407static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
408{
409 int cpu;
410
411 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412 if (!cpu_possible(cpu))
413 continue;
414 *pos = cpu+1;
415 return per_cpu_ptr(rt_cache_stat, cpu);
416 }
417 return NULL;
418
419}
420
421static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
422{
423
424}
425
426static int rt_cpu_seq_show(struct seq_file *seq, void *v)
427{
428 struct rt_cache_stat *st = v;
429
430 if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
433 }
434
435 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
436 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437 atomic_read(&ipv4_dst_ops.entries),
438 st->in_hit,
439 st->in_slow_tot,
440 st->in_slow_mc,
441 st->in_no_route,
442 st->in_brd,
443 st->in_martian_dst,
444 st->in_martian_src,
445
446 st->out_hit,
447 st->out_slow_tot,
448 st->out_slow_mc,
449
450 st->gc_total,
451 st->gc_ignored,
452 st->gc_goal_miss,
453 st->gc_dst_overflow,
454 st->in_hlist_search,
455 st->out_hlist_search
456 );
457 return 0;
458}
459
460static struct seq_operations rt_cpu_seq_ops = {
461 .start = rt_cpu_seq_start,
462 .next = rt_cpu_seq_next,
463 .stop = rt_cpu_seq_stop,
464 .show = rt_cpu_seq_show,
465};
466
467
468static int rt_cpu_seq_open(struct inode *inode, struct file *file)
469{
470 return seq_open(file, &rt_cpu_seq_ops);
471}
472
473static struct file_operations rt_cpu_seq_fops = {
474 .owner = THIS_MODULE,
475 .open = rt_cpu_seq_open,
476 .read = seq_read,
477 .llseek = seq_lseek,
478 .release = seq_release,
479};
480
481#endif /* CONFIG_PROC_FS */
482
483static __inline__ void rt_free(struct rtable *rt)
484{
485 multipath_remove(rt);
486 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
487}
488
489static __inline__ void rt_drop(struct rtable *rt)
490{
491 multipath_remove(rt);
492 ip_rt_put(rt);
493 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
494}
495
496static __inline__ int rt_fast_clean(struct rtable *rth)
497{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in the hash table with more useful entries */
500 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
501 rth->fl.iif && rth->u.rt_next;
502}
503
504static __inline__ int rt_valuable(struct rtable *rth)
505{
506 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
507 rth->u.dst.expires;
508}
509
510static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
511{
512 unsigned long age;
513 int ret = 0;
514
515 if (atomic_read(&rth->u.dst.__refcnt))
516 goto out;
517
518 ret = 1;
519 if (rth->u.dst.expires &&
520 time_after_eq(jiffies, rth->u.dst.expires))
521 goto out;
522
523 age = jiffies - rth->u.dst.lastuse;
524 ret = 0;
525 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
526 (age <= tmo2 && rt_valuable(rth)))
527 goto out;
528 ret = 1;
529out: return ret;
530}
531
532/* Bits of score are:
533 * 31: very valuable
534 * 30: not quite useless
535 * 29..0: usage counter
536 */
537static inline u32 rt_score(struct rtable *rt)
538{
539 u32 score = jiffies - rt->u.dst.lastuse;
540
541 score = ~score & ~(3<<30);
542
543 if (rt_valuable(rt))
544 score |= (1<<31);
545
546 if (!rt->fl.iif ||
547 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
548 score |= (1<<30);
549
550 return score;
551}
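/* Worked example (illustrative only): an unreferenced, non-"valuable"
 * output-route entry last used 100 jiffies ago scores
 * (~100 & ~(3<<30)) | (1<<30); an older broadcast entry scores lower, and
 * rt_intern_hash() below evicts the chain member with the *lowest* score
 * once a bucket grows past ip_rt_gc_elasticity.
 */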
552
553static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
554{
555 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
556 fl1->oif == fl2->oif &&
557 fl1->iif == fl2->iif;
558}
559
560#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
561static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
562 struct rtable *expentry,
563 int *removed_count)
564{
565 int passedexpired = 0;
566 struct rtable **nextstep = NULL;
567 struct rtable **rthp = chain_head;
568 struct rtable *rth;
569
570 if (removed_count)
571 *removed_count = 0;
572
573 while ((rth = *rthp) != NULL) {
574 if (rth == expentry)
575 passedexpired = 1;
576
577 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 &&
578 compare_keys(&(*rthp)->fl, &expentry->fl)) {
579 if (*rthp == expentry) {
580 *rthp = rth->u.rt_next;
581 continue;
582 } else {
583 *rthp = rth->u.rt_next;
584 rt_free(rth);
585 if (removed_count)
586 ++(*removed_count);
587 }
588 } else {
589 if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
590 passedexpired && !nextstep)
591 nextstep = &rth->u.rt_next;
592
593 rthp = &rth->u.rt_next;
594 }
595 }
596
597 rt_free(expentry);
598 if (removed_count)
599 ++(*removed_count);
600
601 return nextstep;
602}
603#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
604
605
606/* This runs via a timer and thus is always in BH context. */
607static void rt_check_expire(unsigned long dummy)
608{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
622
623 i = (i + 1) & rt_hash_mask;
624 rthp = &rt_hash_table[i].chain;
625
		if (*rthp == 0)
			continue;
		spin_lock(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
630 if (rth->u.dst.expires) {
631 /* Entry is expired even if it is in use */
632 if (time_before_eq(now, rth->u.dst.expires)) {
633 tmo >>= 1;
634 rthp = &rth->u.rt_next;
635 continue;
636 }
637 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
638 tmo >>= 1;
639 rthp = &rth->u.rt_next;
640 continue;
641 }
642
643 /* Cleanup aged off entries. */
644#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
645 /* remove all related balanced entries if necessary */
646 if (rth->u.dst.flags & DST_BALANCED) {
647 rthp = rt_remove_balanced_route(
648 &rt_hash_table[i].chain,
649 rth, NULL);
650 if (!rthp)
651 break;
652 } else {
653 *rthp = rth->u.rt_next;
654 rt_free(rth);
655 }
656#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
657 *rthp = rth->u.rt_next;
658 rt_free(rth);
659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
660 }
		spin_unlock(rt_hash_lock_addr(i));

663 /* Fallback loop breaker. */
664 if (time_after(jiffies, now))
665 break;
666 }
667 rover = i;
	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}
670
671/* This can run from both BH and non-BH contexts, the latter
672 * in the case of a forced flush event.
673 */
674static void rt_run_flush(unsigned long dummy)
675{
676 int i;
677 struct rtable *rth, *next;
678
679 rt_deadline = 0;
680
681 get_random_bytes(&rt_hash_rnd, 4);
682
683 for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
686 if (rth)
687 rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

690 for (; rth; rth = next) {
691 next = rth->u.rt_next;
692 rt_free(rth);
693 }
694 }
695}
696
697static DEFINE_SPINLOCK(rt_flush_lock);
698
699void rt_cache_flush(int delay)
700{
701 unsigned long now = jiffies;
702 int user_mode = !in_softirq();
703
704 if (delay < 0)
705 delay = ip_rt_min_delay;
706
707 /* flush existing multipath state*/
708 multipath_flush();
709
710 spin_lock_bh(&rt_flush_lock);
711
712 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
713 long tmo = (long)(rt_deadline - now);
714
		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline has not been reached yet, prolong the timer to "delay",
		   otherwise fire it at the deadline time.
		 */
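		/* With the defaults above this means, roughly: a negative
		 * "delay" asks for ip_rt_min_delay (2s) of batching, and a
		 * pending flush is never postponed past
		 * rt_deadline = now + ip_rt_max_delay (10s) from the first
		 * request.
		 */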
721
722 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
723 tmo = 0;
724
725 if (delay > tmo)
726 delay = tmo;
727 }
728
729 if (delay <= 0) {
730 spin_unlock_bh(&rt_flush_lock);
731 rt_run_flush(0);
732 return;
733 }
734
735 if (rt_deadline == 0)
736 rt_deadline = now + ip_rt_max_delay;
737
738 mod_timer(&rt_flush_timer, now+delay);
739 spin_unlock_bh(&rt_flush_lock);
740}
741
742static void rt_secret_rebuild(unsigned long dummy)
743{
744 unsigned long now = jiffies;
745
746 rt_cache_flush(0);
747 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
748}
749
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle
   "expire" is large enough to keep plenty of warm entries, and when
   the load increases it shrinks to limit the cache size.
 */
762
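/* Concretely (a rough sketch of the arithmetic below): with the default
 * ip_rt_gc_elasticity of 8, the collector only has a positive "goal" once
 * the cache holds more than 8 entries per hash bucket (8 << rt_hash_log);
 * "expire" is halved on each pass that misses its goal and grown back by
 * ip_rt_gc_min_interval once the goal is met.
 */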
763static int rt_garbage_collect(void)
764{
765 static unsigned long expire = RT_GC_TIMEOUT;
766 static unsigned long last_gc;
767 static int rover;
768 static int equilibrium;
769 struct rtable *rth, **rthp;
770 unsigned long now = jiffies;
771 int goal;
772
773 /*
774 * Garbage collection is pretty expensive,
775 * do not make it too frequently.
776 */
777
778 RT_CACHE_STAT_INC(gc_total);
779
780 if (now - last_gc < ip_rt_gc_min_interval &&
781 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
782 RT_CACHE_STAT_INC(gc_ignored);
783 goto out;
784 }
785
786 /* Calculate number of entries, which we want to expire now. */
787 goal = atomic_read(&ipv4_dst_ops.entries) -
788 (ip_rt_gc_elasticity << rt_hash_log);
789 if (goal <= 0) {
790 if (equilibrium < ipv4_dst_ops.gc_thresh)
791 equilibrium = ipv4_dst_ops.gc_thresh;
792 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
793 if (goal > 0) {
794 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
795 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
796 }
797 } else {
798 /* We are in dangerous area. Try to reduce cache really
799 * aggressively.
800 */
801 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
802 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
803 }
804
805 if (now - last_gc >= ip_rt_gc_min_interval)
806 last_gc = now;
807
808 if (goal <= 0) {
809 equilibrium += goal;
810 goto work_done;
811 }
812
813 do {
814 int i, k;
815
816 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
817 unsigned long tmo = expire;
818
819 k = (k + 1) & rt_hash_mask;
820 rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
823 if (!rt_may_expire(rth, tmo, expire)) {
824 tmo >>= 1;
825 rthp = &rth->u.rt_next;
826 continue;
827 }
828#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
829 /* remove all related balanced entries
830 * if necessary
831 */
832 if (rth->u.dst.flags & DST_BALANCED) {
833 int r;
834
835 rthp = rt_remove_balanced_route(
836 &rt_hash_table[i].chain,
837 rth,
838 &r);
839 goal -= r;
840 if (!rthp)
841 break;
842 } else {
843 *rthp = rth->u.rt_next;
844 rt_free(rth);
845 goal--;
846 }
847#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
848 *rthp = rth->u.rt_next;
849 rt_free(rth);
850 goal--;
851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
852 }
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
855 break;
856 }
857 rover = k;
858
859 if (goal <= 0)
860 goto work_done;
861
		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */
870
871 RT_CACHE_STAT_INC(gc_goal_miss);
872
873 if (expire == 0)
874 break;
875
876 expire >>= 1;
877#if RT_CACHE_DEBUG >= 2
878 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
879 atomic_read(&ipv4_dst_ops.entries), goal, i);
880#endif
881
882 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
883 goto out;
884 } while (!in_softirq() && time_before_eq(jiffies, now));
885
886 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
887 goto out;
888 if (net_ratelimit())
889 printk(KERN_WARNING "dst cache overflow\n");
890 RT_CACHE_STAT_INC(gc_dst_overflow);
891 return 1;
892
893work_done:
894 expire += ip_rt_gc_min_interval;
895 if (expire > ip_rt_gc_timeout ||
896 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
897 expire = ip_rt_gc_timeout;
898#if RT_CACHE_DEBUG >= 2
899 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
900 atomic_read(&ipv4_dst_ops.entries), goal, rover);
901#endif
902out: return 0;
903}
904
905static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
906{
907 struct rtable *rth, **rthp;
908 unsigned long now;
909 struct rtable *cand, **candp;
910 u32 min_score;
911 int chain_length;
912 int attempts = !in_softirq();
913
914restart:
915 chain_length = 0;
916 min_score = ~(u32)0;
917 cand = NULL;
918 candp = NULL;
919 now = jiffies;
920
921 rthp = &rt_hash_table[hash].chain;
922
	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
926 if (!(rth->u.dst.flags & DST_BALANCED) &&
927 compare_keys(&rth->fl, &rt->fl)) {
928#else
929 if (compare_keys(&rth->fl, &rt->fl)) {
930#endif
931 /* Put it first */
932 *rthp = rth->u.rt_next;
933 /*
934 * Since lookup is lockfree, the deletion
935 * must be visible to another weakly ordered CPU before
936 * the insertion at the start of the hash chain.
937 */
938 rcu_assign_pointer(rth->u.rt_next,
939 rt_hash_table[hash].chain);
940 /*
941 * Since lookup is lockfree, the update writes
942 * must be ordered for consistency on SMP.
943 */
944 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
945
946 rth->u.dst.__use++;
947 dst_hold(&rth->u.dst);
948 rth->u.dst.lastuse = now;
			spin_unlock_bh(rt_hash_lock_addr(hash));

951 rt_drop(rt);
952 *rp = rth;
953 return 0;
954 }
955
956 if (!atomic_read(&rth->u.dst.__refcnt)) {
957 u32 score = rt_score(rth);
958
959 if (score <= min_score) {
960 cand = rth;
961 candp = rthp;
962 min_score = score;
963 }
964 }
965
966 chain_length++;
967
968 rthp = &rth->u.rt_next;
969 }
970
971 if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * once exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
978 if (chain_length > ip_rt_gc_elasticity) {
979 *candp = cand->u.rt_next;
980 rt_free(cand);
981 }
982 }
983
984 /* Try to bind route to arp only if it is output
985 route or unicast forwarding path.
986 */
987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
988 int err = arp_bind_neighbour(&rt->u.dst);
989 if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

992 if (err != -ENOBUFS) {
993 rt_drop(rt);
994 return err;
995 }
996
997 /* Neighbour tables are full and nothing
998 can be released. Try to shrink route cache,
999 it is most likely it holds some neighbour records.
1000 */
1001 if (attempts-- > 0) {
1002 int saved_elasticity = ip_rt_gc_elasticity;
1003 int saved_int = ip_rt_gc_min_interval;
1004 ip_rt_gc_elasticity = 1;
1005 ip_rt_gc_min_interval = 0;
1006 rt_garbage_collect();
1007 ip_rt_gc_min_interval = saved_int;
1008 ip_rt_gc_elasticity = saved_elasticity;
1009 goto restart;
1010 }
1011
1012 if (net_ratelimit())
1013 printk(KERN_WARNING "Neighbour table overflow.\n");
1014 rt_drop(rt);
1015 return -ENOBUFS;
1016 }
1017 }
1018
1019 rt->u.rt_next = rt_hash_table[hash].chain;
1020#if RT_CACHE_DEBUG >= 2
1021 if (rt->u.rt_next) {
1022 struct rtable *trt;
1023 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1024 NIPQUAD(rt->rt_dst));
1025 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
1026 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1027 printk("\n");
1028 }
1029#endif
1030 rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
1033 return 0;
1034}
1035
1036void rt_bind_peer(struct rtable *rt, int create)
1037{
1038 static DEFINE_SPINLOCK(rt_peer_lock);
1039 struct inet_peer *peer;
1040
1041 peer = inet_getpeer(rt->rt_dst, create);
1042
1043 spin_lock_bh(&rt_peer_lock);
1044 if (rt->peer == NULL) {
1045 rt->peer = peer;
1046 peer = NULL;
1047 }
1048 spin_unlock_bh(&rt_peer_lock);
1049 if (peer)
1050 inet_putpeer(peer);
1051}
1052
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique within a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
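/* A sketch of the fallback below: each call mixes the previous value with
 * the destination through secure_ip_id(), keeps the low 16 bits as the
 * datagram ID, and remembers the full salt for the next call -- cheap, but
 * with no per-destination uniqueness guarantee, as the comment above admits.
 */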
1060static void ip_select_fb_ident(struct iphdr *iph)
1061{
1062 static DEFINE_SPINLOCK(ip_fb_id_lock);
1063 static u32 ip_fallback_id;
1064 u32 salt;
1065
1066 spin_lock_bh(&ip_fb_id_lock);
1067 salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
1068 iph->id = htons(salt & 0xFFFF);
1069 ip_fallback_id = salt;
1070 spin_unlock_bh(&ip_fb_id_lock);
1071}
1072
1073void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1074{
1075 struct rtable *rt = (struct rtable *) dst;
1076
1077 if (rt) {
1078 if (rt->peer == NULL)
1079 rt_bind_peer(rt, 1);
1080
		/* If a peer is attached to the destination, it is never detached,
		   so we do not need to grab a lock to dereference it.
		 */
1084 if (rt->peer) {
1085 iph->id = htons(inet_getid(rt->peer, more));
1086 return;
1087 }
1088 } else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

1092 ip_select_fb_ident(iph);
1093}
1094
1095static void rt_del(unsigned hash, struct rtable *rt)
1096{
1097 struct rtable **rthp;
1098
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1102 rthp = &(*rthp)->u.rt_next)
1103 if (*rthp == rt) {
1104 *rthp = rt->u.rt_next;
1105 rt_free(rt);
1106 break;
1107 }
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1110
1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
1112 u32 saddr, u8 tos, struct net_device *dev)
1113{
1114 int i, k;
1115 struct in_device *in_dev = in_dev_get(dev);
1116 struct rtable *rth, **rthp;
1117 u32 skeys[2] = { saddr, 0 };
1118 int ikeys[2] = { dev->ifindex, 0 };
1119
1120 tos &= IPTOS_RT_MASK;
1121
1122 if (!in_dev)
1123 return;
1124
1125 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1126 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1127 goto reject_redirect;
1128
1129 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1130 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1131 goto reject_redirect;
1132 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1133 goto reject_redirect;
1134 } else {
1135 if (inet_addr_type(new_gw) != RTN_UNICAST)
1136 goto reject_redirect;
1137 }
1138
1139 for (i = 0; i < 2; i++) {
1140 for (k = 0; k < 2; k++) {
1141 unsigned hash = rt_hash_code(daddr,
1142 skeys[i] ^ (ikeys[k] << 5),
1143 tos);
1144
1145 rthp=&rt_hash_table[hash].chain;
1146
1147 rcu_read_lock();
1148 while ((rth = rcu_dereference(*rthp)) != NULL) {
1149 struct rtable *rt;
1150
1151 if (rth->fl.fl4_dst != daddr ||
1152 rth->fl.fl4_src != skeys[i] ||
1153 rth->fl.fl4_tos != tos ||
1154 rth->fl.oif != ikeys[k] ||
1155 rth->fl.iif != 0) {
1156 rthp = &rth->u.rt_next;
1157 continue;
1158 }
1159
1160 if (rth->rt_dst != daddr ||
1161 rth->rt_src != saddr ||
1162 rth->u.dst.error ||
1163 rth->rt_gateway != old_gw ||
1164 rth->u.dst.dev != dev)
1165 break;
1166
1167 dst_hold(&rth->u.dst);
1168 rcu_read_unlock();
1169
1170 rt = dst_alloc(&ipv4_dst_ops);
1171 if (rt == NULL) {
1172 ip_rt_put(rth);
1173 in_dev_put(in_dev);
1174 return;
1175 }
1176
1177 /* Copy all the information. */
1178 *rt = *rth;
1179 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1180 rt->u.dst.__use = 1;
1181 atomic_set(&rt->u.dst.__refcnt, 1);
1182 rt->u.dst.child = NULL;
1183 if (rt->u.dst.dev)
1184 dev_hold(rt->u.dst.dev);
1185 if (rt->idev)
1186 in_dev_hold(rt->idev);
1187 rt->u.dst.obsolete = 0;
1188 rt->u.dst.lastuse = jiffies;
1189 rt->u.dst.path = &rt->u.dst;
1190 rt->u.dst.neighbour = NULL;
1191 rt->u.dst.hh = NULL;
1192 rt->u.dst.xfrm = NULL;
1193
1194 rt->rt_flags |= RTCF_REDIRECTED;
1195
1196 /* Gateway is different ... */
1197 rt->rt_gateway = new_gw;
1198
1199 /* Redirect received -> path was valid */
1200 dst_confirm(&rth->u.dst);
1201
1202 if (rt->peer)
1203 atomic_inc(&rt->peer->refcnt);
1204
1205 if (arp_bind_neighbour(&rt->u.dst) ||
1206 !(rt->u.dst.neighbour->nud_state &
1207 NUD_VALID)) {
1208 if (rt->u.dst.neighbour)
1209 neigh_event_send(rt->u.dst.neighbour, NULL);
1210 ip_rt_put(rth);
1211 rt_drop(rt);
1212 goto do_next;
1213 }
1214
1215 rt_del(hash, rth);
1216 if (!rt_intern_hash(hash, rt, &rt))
1217 ip_rt_put(rt);
1218 goto do_next;
1219 }
1220 rcu_read_unlock();
1221 do_next:
1222 ;
1223 }
1224 }
1225 in_dev_put(in_dev);
1226 return;
1227
1228reject_redirect:
1229#ifdef CONFIG_IP_ROUTE_VERBOSE
1230 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1231 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1232 "%u.%u.%u.%u ignored.\n"
1233 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
1234 "tos %02x\n",
1235 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1236 NIPQUAD(saddr), NIPQUAD(daddr), tos);
1237#endif
1238 in_dev_put(in_dev);
1239}
1240
1241static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1242{
1243 struct rtable *rt = (struct rtable*)dst;
1244 struct dst_entry *ret = dst;
1245
1246 if (rt) {
1247 if (dst->obsolete) {
1248 ip_rt_put(rt);
1249 ret = NULL;
1250 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1251 rt->u.dst.expires) {
1252 unsigned hash = rt_hash_code(rt->fl.fl4_dst,
1253 rt->fl.fl4_src ^
1254 (rt->fl.oif << 5),
1255 rt->fl.fl4_tos);
1256#if RT_CACHE_DEBUG >= 1
1257 printk(KERN_DEBUG "ip_rt_advice: redirect to "
1258 "%u.%u.%u.%u/%02x dropped\n",
1259 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1260#endif
1261 rt_del(hash, rt);
1262 ret = NULL;
1263 }
1264 }
1265 return ret;
1266}
1267
1268/*
1269 * Algorithm:
1270 * 1. The first ip_rt_redirect_number redirects are sent
1271 * with exponential backoff, then we stop sending them at all,
1272 * assuming that the host ignores our redirects.
1273 * 2. If we did not see packets requiring redirects
1274 * during ip_rt_redirect_silence, we assume that the host
1275 * forgot redirected route and start to send redirects again.
1276 *
1277 * This algorithm is much cheaper and more intelligent than dumb load limiting
1278 * in icmp.c.
1279 *
1280 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1281 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1282 */
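/* With the defaults above (ip_rt_redirect_number = 9, ip_rt_redirect_load =
 * HZ/50, ip_rt_redirect_silence = (HZ/50) << 10) this gives, roughly: up to
 * nine redirects per cached route spaced 20ms, 40ms, 80ms, ... apart, then
 * silence until the peer has sent nothing redirect-worthy for about 20
 * seconds.
 */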
1283
1284void ip_rt_send_redirect(struct sk_buff *skb)
1285{
1286 struct rtable *rt = (struct rtable*)skb->dst;
1287 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1288
1289 if (!in_dev)
1290 return;
1291
1292 if (!IN_DEV_TX_REDIRECTS(in_dev))
1293 goto out;
1294
1295 /* No redirected packets during ip_rt_redirect_silence;
1296 * reset the algorithm.
1297 */
1298 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1299 rt->u.dst.rate_tokens = 0;
1300
	/* Too many ignored redirects; do not send anything.
	 * Set u.dst.rate_last to the last seen redirected packet.
	 */
1304 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1305 rt->u.dst.rate_last = jiffies;
1306 goto out;
1307 }
1308
1309 /* Check for load limit; set rate_last to the latest sent
1310 * redirect.
1311 */
1312 if (time_after(jiffies,
1313 (rt->u.dst.rate_last +
1314 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1315 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1316 rt->u.dst.rate_last = jiffies;
1317 ++rt->u.dst.rate_tokens;
1318#ifdef CONFIG_IP_ROUTE_VERBOSE
1319 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1320 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1321 net_ratelimit())
1322 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1323 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1324 NIPQUAD(rt->rt_src), rt->rt_iif,
1325 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1326#endif
1327 }
1328out:
1329 in_dev_put(in_dev);
1330}
1331
1332static int ip_error(struct sk_buff *skb)
1333{
1334 struct rtable *rt = (struct rtable*)skb->dst;
1335 unsigned long now;
1336 int code;
1337
1338 switch (rt->u.dst.error) {
1339 case EINVAL:
1340 default:
1341 goto out;
1342 case EHOSTUNREACH:
1343 code = ICMP_HOST_UNREACH;
1344 break;
1345 case ENETUNREACH:
1346 code = ICMP_NET_UNREACH;
1347 break;
1348 case EACCES:
1349 code = ICMP_PKT_FILTERED;
1350 break;
1351 }
1352
1353 now = jiffies;
1354 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1355 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1356 rt->u.dst.rate_tokens = ip_rt_error_burst;
1357 rt->u.dst.rate_last = now;
1358 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1359 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1360 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1361 }
1362
1363out: kfree_skb(skb);
1364 return 0;
1365}
1366
1367/*
1368 * The last two values are not from the RFC but
1369 * are needed for AMPRnet AX.25 paths.
1370 */
1371
1372static unsigned short mtu_plateau[] =
1373{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1374
1375static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1376{
1377 int i;
1378
1379 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1380 if (old_mtu > mtu_plateau[i])
1381 return mtu_plateau[i];
1382 return 68;
1383}
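/* Example (illustrative): an ICMP "fragmentation needed" carrying no next-hop
 * MTU for a route currently at 1500 bytes walks this RFC 1191-style plateau
 * table and settles on 1492; a later one at 1492 would drop to 576, and 68 is
 * the absolute floor.
 */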
1384
1385unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1386{
1387 int i;
1388 unsigned short old_mtu = ntohs(iph->tot_len);
1389 struct rtable *rth;
1390 u32 skeys[2] = { iph->saddr, 0, };
1391 u32 daddr = iph->daddr;
1392 u8 tos = iph->tos & IPTOS_RT_MASK;
1393 unsigned short est_mtu = 0;
1394
1395 if (ipv4_config.no_pmtu_disc)
1396 return 0;
1397
1398 for (i = 0; i < 2; i++) {
1399 unsigned hash = rt_hash_code(daddr, skeys[i], tos);
1400
1401 rcu_read_lock();
1402 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1403 rth = rcu_dereference(rth->u.rt_next)) {
1404 if (rth->fl.fl4_dst == daddr &&
1405 rth->fl.fl4_src == skeys[i] &&
1406 rth->rt_dst == daddr &&
1407 rth->rt_src == iph->saddr &&
1408 rth->fl.fl4_tos == tos &&
1409 rth->fl.iif == 0 &&
1410 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1411 unsigned short mtu = new_mtu;
1412
1413 if (new_mtu < 68 || new_mtu >= old_mtu) {
1414
1415 /* BSD 4.2 compatibility hack :-( */
1416 if (mtu == 0 &&
1417 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1418 old_mtu >= 68 + (iph->ihl << 2))
1419 old_mtu -= iph->ihl << 2;
1420
1421 mtu = guess_mtu(old_mtu);
1422 }
1423 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1424 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1425 dst_confirm(&rth->u.dst);
1426 if (mtu < ip_rt_min_pmtu) {
1427 mtu = ip_rt_min_pmtu;
1428 rth->u.dst.metrics[RTAX_LOCK-1] |=
1429 (1 << RTAX_MTU);
1430 }
1431 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1432 dst_set_expires(&rth->u.dst,
1433 ip_rt_mtu_expires);
1434 }
1435 est_mtu = mtu;
1436 }
1437 }
1438 }
1439 rcu_read_unlock();
1440 }
1441 return est_mtu ? : new_mtu;
1442}
1443
1444static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1445{
1446 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1447 !(dst_metric_locked(dst, RTAX_MTU))) {
1448 if (mtu < ip_rt_min_pmtu) {
1449 mtu = ip_rt_min_pmtu;
1450 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1451 }
1452 dst->metrics[RTAX_MTU-1] = mtu;
1453 dst_set_expires(dst, ip_rt_mtu_expires);
1454 }
1455}
1456
1457static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1458{
1459 return NULL;
1460}
1461
1462static void ipv4_dst_destroy(struct dst_entry *dst)
1463{
1464 struct rtable *rt = (struct rtable *) dst;
1465 struct inet_peer *peer = rt->peer;
1466 struct in_device *idev = rt->idev;
1467
1468 if (peer) {
1469 rt->peer = NULL;
1470 inet_putpeer(peer);
1471 }
1472
1473 if (idev) {
1474 rt->idev = NULL;
1475 in_dev_put(idev);
1476 }
1477}
1478
1479static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1480 int how)
1481{
1482 struct rtable *rt = (struct rtable *) dst;
1483 struct in_device *idev = rt->idev;
1484 if (dev != &loopback_dev && idev && idev->dev == dev) {
1485 struct in_device *loopback_idev = in_dev_get(&loopback_dev);
1486 if (loopback_idev) {
1487 rt->idev = loopback_idev;
1488 in_dev_put(idev);
1489 }
1490 }
1491}
1492
1493static void ipv4_link_failure(struct sk_buff *skb)
1494{
1495 struct rtable *rt;
1496
1497 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1498
1499 rt = (struct rtable *) skb->dst;
1500 if (rt)
1501 dst_set_expires(&rt->u.dst, 0);
1502}
1503
1504static int ip_rt_bug(struct sk_buff *skb)
1505{
1506 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1507 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
1508 skb->dev ? skb->dev->name : "?");
1509 kfree_skb(skb);
1510 return 0;
1511}
1512
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1521
1522void ip_rt_get_source(u8 *addr, struct rtable *rt)
1523{
1524 u32 src;
1525 struct fib_result res;
1526
1527 if (rt->fl.iif == 0)
1528 src = rt->rt_src;
1529 else if (fib_lookup(&rt->fl, &res) == 0) {
1530 src = FIB_RES_PREFSRC(res);
1531 fib_res_put(&res);
1532 } else
1533 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1534 RT_SCOPE_UNIVERSE);
1535 memcpy(addr, &src, 4);
1536}
1537
1538#ifdef CONFIG_NET_CLS_ROUTE
1539static void set_class_tag(struct rtable *rt, u32 tag)
1540{
1541 if (!(rt->u.dst.tclassid & 0xFFFF))
1542 rt->u.dst.tclassid |= tag & 0xFFFF;
1543 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1544 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1545}
1546#endif
1547
1548static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1549{
1550 struct fib_info *fi = res->fi;
1551
1552 if (fi) {
1553 if (FIB_RES_GW(*res) &&
1554 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1555 rt->rt_gateway = FIB_RES_GW(*res);
1556 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1557 sizeof(rt->u.dst.metrics));
1558 if (fi->fib_mtu == 0) {
1559 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1560 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1561 rt->rt_gateway != rt->rt_dst &&
1562 rt->u.dst.dev->mtu > 576)
1563 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1564 }
1565#ifdef CONFIG_NET_CLS_ROUTE
1566 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1567#endif
1568 } else
1569 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1570
1571 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1572 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1573 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1574 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1575 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1576 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1577 ip_rt_min_advmss);
1578 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1579 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1580
1581#ifdef CONFIG_NET_CLS_ROUTE
1582#ifdef CONFIG_IP_MULTIPLE_TABLES
1583 set_class_tag(rt, fib_rules_tclass(res));
1584#endif
1585 set_class_tag(rt, itag);
1586#endif
1587 rt->rt_type = res->type;
1588}
1589
1590static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
1591 u8 tos, struct net_device *dev, int our)
1592{
1593 unsigned hash;
1594 struct rtable *rth;
1595 u32 spec_dst;
1596 struct in_device *in_dev = in_dev_get(dev);
1597 u32 itag = 0;
1598
1599 /* Primary sanity checks. */
1600
1601 if (in_dev == NULL)
1602 return -EINVAL;
1603
1604 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1605 skb->protocol != htons(ETH_P_IP))
1606 goto e_inval;
1607
1608 if (ZERONET(saddr)) {
1609 if (!LOCAL_MCAST(daddr))
1610 goto e_inval;
1611 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1612 } else if (fib_validate_source(saddr, 0, tos, 0,
1613 dev, &spec_dst, &itag) < 0)
1614 goto e_inval;
1615
1616 rth = dst_alloc(&ipv4_dst_ops);
1617 if (!rth)
1618 goto e_nobufs;
1619
1620 rth->u.dst.output= ip_rt_bug;
1621
1622 atomic_set(&rth->u.dst.__refcnt, 1);
1623 rth->u.dst.flags= DST_HOST;
1624 if (in_dev->cnf.no_policy)
1625 rth->u.dst.flags |= DST_NOPOLICY;
1626 rth->fl.fl4_dst = daddr;
1627 rth->rt_dst = daddr;
1628 rth->fl.fl4_tos = tos;
1629#ifdef CONFIG_IP_ROUTE_FWMARK
1630 rth->fl.fl4_fwmark= skb->nfmark;
1631#endif
1632 rth->fl.fl4_src = saddr;
1633 rth->rt_src = saddr;
1634#ifdef CONFIG_NET_CLS_ROUTE
1635 rth->u.dst.tclassid = itag;
1636#endif
1637 rth->rt_iif =
1638 rth->fl.iif = dev->ifindex;
1639 rth->u.dst.dev = &loopback_dev;
1640 dev_hold(rth->u.dst.dev);
1641 rth->idev = in_dev_get(rth->u.dst.dev);
1642 rth->fl.oif = 0;
1643 rth->rt_gateway = daddr;
1644 rth->rt_spec_dst= spec_dst;
1645 rth->rt_type = RTN_MULTICAST;
1646 rth->rt_flags = RTCF_MULTICAST;
1647 if (our) {
1648 rth->u.dst.input= ip_local_deliver;
1649 rth->rt_flags |= RTCF_LOCAL;
1650 }
1651
1652#ifdef CONFIG_IP_MROUTE
1653 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1654 rth->u.dst.input = ip_mr_input;
1655#endif
1656 RT_CACHE_STAT_INC(in_slow_mc);
1657
1658 in_dev_put(in_dev);
1659 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
1660 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1661
1662e_nobufs:
1663 in_dev_put(in_dev);
1664 return -ENOBUFS;
1665
1666e_inval:
1667 in_dev_put(in_dev);
1668 return -EINVAL;
1669}
1670
1671
1672static void ip_handle_martian_source(struct net_device *dev,
1673 struct in_device *in_dev,
1674 struct sk_buff *skb,
1675 u32 daddr,
1676 u32 saddr)
1677{
1678 RT_CACHE_STAT_INC(in_martian_src);
1679#ifdef CONFIG_IP_ROUTE_VERBOSE
1680 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1681 /*
1682 * RFC1812 recommendation, if source is martian,
1683 * the only hint is MAC header.
1684 */
1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1686 "%u.%u.%u.%u, on dev %s\n",
1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb->mac.raw) {
			int i;
1690 unsigned char *p = skb->mac.raw;
1691 printk(KERN_WARNING "ll header: ");
1692 for (i = 0; i < dev->hard_header_len; i++, p++) {
1693 printk("%02x", *p);
1694 if (i < (dev->hard_header_len - 1))
1695 printk(":");
1696 }
1697 printk("\n");
1698 }
1699 }
1700#endif
1701}
1702
1703static inline int __mkroute_input(struct sk_buff *skb,
1704 struct fib_result* res,
1705 struct in_device *in_dev,
1706 u32 daddr, u32 saddr, u32 tos,
1707 struct rtable **result)
1708{
1709
1710 struct rtable *rth;
1711 int err;
1712 struct in_device *out_dev;
1713 unsigned flags = 0;
1714 u32 spec_dst, itag;
1715
1716 /* get a working reference to the output device */
1717 out_dev = in_dev_get(FIB_RES_DEV(*res));
1718 if (out_dev == NULL) {
1719 if (net_ratelimit())
1720 printk(KERN_CRIT "Bug in ip_route_input" \
1721 "_slow(). Please, report\n");
1722 return -EINVAL;
1723 }
1724
1725
1726 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1727 in_dev->dev, &spec_dst, &itag);
1728 if (err < 0) {
1729 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1730 saddr);
1731
1732 err = -EINVAL;
1733 goto cleanup;
1734 }
1735
1736 if (err)
1737 flags |= RTCF_DIRECTSRC;
1738
1739 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1740 (IN_DEV_SHARED_MEDIA(out_dev) ||
1741 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1742 flags |= RTCF_DOREDIRECT;
1743
1744 if (skb->protocol != htons(ETH_P_IP)) {
1745 /* Not IP (i.e. ARP). Do not create route, if it is
1746 * invalid for proxy arp. DNAT routes are always valid.
1747 */
1748 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1749 err = -EINVAL;
1750 goto cleanup;
1751 }
1752 }
1753
1754
1755 rth = dst_alloc(&ipv4_dst_ops);
1756 if (!rth) {
1757 err = -ENOBUFS;
1758 goto cleanup;
1759 }
1760
1761 rth->u.dst.flags= DST_HOST;
1762#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
1763 if (res->fi->fib_nhs > 1)
1764 rth->u.dst.flags |= DST_BALANCED;
1765#endif
1766 if (in_dev->cnf.no_policy)
1767 rth->u.dst.flags |= DST_NOPOLICY;
1768 if (in_dev->cnf.no_xfrm)
1769 rth->u.dst.flags |= DST_NOXFRM;
1770 rth->fl.fl4_dst = daddr;
1771 rth->rt_dst = daddr;
1772 rth->fl.fl4_tos = tos;
1773#ifdef CONFIG_IP_ROUTE_FWMARK
1774 rth->fl.fl4_fwmark= skb->nfmark;
1775#endif
1776 rth->fl.fl4_src = saddr;
1777 rth->rt_src = saddr;
1778 rth->rt_gateway = daddr;
1779 rth->rt_iif =
1780 rth->fl.iif = in_dev->dev->ifindex;
1781 rth->u.dst.dev = (out_dev)->dev;
1782 dev_hold(rth->u.dst.dev);
1783 rth->idev = in_dev_get(rth->u.dst.dev);
1784 rth->fl.oif = 0;
1785 rth->rt_spec_dst= spec_dst;
1786
1787 rth->u.dst.input = ip_forward;
1788 rth->u.dst.output = ip_output;
1789
1790 rt_set_nexthop(rth, res, itag);
1791
1792 rth->rt_flags = flags;
1793
1794 *result = rth;
1795 err = 0;
1796 cleanup:
1797 /* release the working reference to the output device */
1798 in_dev_put(out_dev);
1799 return err;
1800}
1801
1802static inline int ip_mkroute_input_def(struct sk_buff *skb,
1803 struct fib_result* res,
1804 const struct flowi *fl,
1805 struct in_device *in_dev,
1806 u32 daddr, u32 saddr, u32 tos)
1807{
	struct rtable* rth = NULL;
	int err;
1810 unsigned hash;
1811
1812#ifdef CONFIG_IP_ROUTE_MULTIPATH
1813 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1814 fib_select_multipath(fl, res);
1815#endif
1816
1817 /* create a routing cache entry */
1818 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1819 if (err)
1820 return err;
1821 atomic_set(&rth->u.dst.__refcnt, 1);
1822
1823 /* put it into the cache */
1824 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1825 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1826}
1827
1828static inline int ip_mkroute_input(struct sk_buff *skb,
1829 struct fib_result* res,
1830 const struct flowi *fl,
1831 struct in_device *in_dev,
1832 u32 daddr, u32 saddr, u32 tos)
1833{
1834#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
	struct rtable* rth = NULL;
	unsigned char hop, hopcount, lasthop;
1837 int err = -EINVAL;
1838 unsigned int hash;
1839
1840 if (res->fi)
1841 hopcount = res->fi->fib_nhs;
1842 else
1843 hopcount = 1;
1844
1845 lasthop = hopcount - 1;
1846
1847 /* distinguish between multipath and singlepath */
1848 if (hopcount < 2)
1849 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
1850 saddr, tos);
1851
1852 /* add all alternatives to the routing cache */
1853 for (hop = 0; hop < hopcount; hop++) {
1854 res->nh_sel = hop;
1855
1856 /* create a routing cache entry */
1857 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
1858 &rth);
1859 if (err)
1860 return err;
1861
1862 /* put it into the cache */
1863 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
1864 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1865 if (err)
1866 return err;
1867
1868 /* forward hop information to multipath impl. */
1869 multipath_set_nhinfo(rth,
1870 FIB_RES_NETWORK(*res),
1871 FIB_RES_NETMASK(*res),
1872 res->prefixlen,
1873 &FIB_RES_NH(*res));
1874
1875 /* only for the last hop the reference count is handled
1876 * outside
1877 */
1878 if (hop == lasthop)
1879 atomic_set(&(skb->dst->__refcnt), 1);
1880 }
1881 return err;
1882#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1883 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
1884#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
1885}
1886
1887
/*
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1897
1898static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1899 u8 tos, struct net_device *dev)
1900{
1901 struct fib_result res;
1902 struct in_device *in_dev = in_dev_get(dev);
1903 struct flowi fl = { .nl_u = { .ip4_u =
1904 { .daddr = daddr,
1905 .saddr = saddr,
1906 .tos = tos,
1907 .scope = RT_SCOPE_UNIVERSE,
1908#ifdef CONFIG_IP_ROUTE_FWMARK
1909 .fwmark = skb->nfmark
1910#endif
1911 } },
1912 .iif = dev->ifindex };
1913 unsigned flags = 0;
1914 u32 itag = 0;
1915 struct rtable * rth;
1916 unsigned hash;
1917 u32 spec_dst;
1918 int err = -EINVAL;
1919 int free_res = 0;
1920
1921 /* IP on this device is disabled. */
1922
1923 if (!in_dev)
1924 goto out;
1925
	/* Check for the most weird martians, which may not be detected
	   by fib_lookup.
	 */
1929
1930 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1931 goto martian_source;
1932
1933 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
1934 goto brd_input;
1935
	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
1939 if (ZERONET(saddr))
1940 goto martian_source;
1941
1942 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1943 goto martian_destination;
1944
1945 /*
1946 * Now we are ready to route packet.
1947 */
1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1949 if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
1952 }
1953 free_res = 1;
1954
1955 RT_CACHE_STAT_INC(in_slow_tot);
1956
1957 if (res.type == RTN_BROADCAST)
1958 goto brd_input;
1959
1960 if (res.type == RTN_LOCAL) {
1961 int result;
1962 result = fib_validate_source(saddr, daddr, tos,
1963 loopback_dev.ifindex,
1964 dev, &spec_dst, &itag);
1965 if (result < 0)
1966 goto martian_source;
1967 if (result)
1968 flags |= RTCF_DIRECTSRC;
1969 spec_dst = daddr;
1970 goto local_input;
1971 }
1972
1973 if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
1976 goto martian_destination;
1977
1978 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1979 if (err == -ENOBUFS)
1980 goto e_nobufs;
1981 if (err == -EINVAL)
1982 goto e_inval;
1983
1984done:
1985 in_dev_put(in_dev);
1986 if (free_res)
1987 fib_res_put(&res);
1988out: return err;
1989
1990brd_input:
1991 if (skb->protocol != htons(ETH_P_IP))
1992 goto e_inval;
1993
1994 if (ZERONET(saddr))
1995 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1996 else {
1997 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1998 &itag);
1999 if (err < 0)
2000 goto martian_source;
2001 if (err)
2002 flags |= RTCF_DIRECTSRC;
2003 }
2004 flags |= RTCF_BROADCAST;
2005 res.type = RTN_BROADCAST;
2006 RT_CACHE_STAT_INC(in_brd);
2007
2008local_input:
2009 rth = dst_alloc(&ipv4_dst_ops);
2010 if (!rth)
2011 goto e_nobufs;
2012
2013 rth->u.dst.output= ip_rt_bug;
2014
2015 atomic_set(&rth->u.dst.__refcnt, 1);
2016 rth->u.dst.flags= DST_HOST;
2017 if (in_dev->cnf.no_policy)
2018 rth->u.dst.flags |= DST_NOPOLICY;
2019 rth->fl.fl4_dst = daddr;
2020 rth->rt_dst = daddr;
2021 rth->fl.fl4_tos = tos;
2022#ifdef CONFIG_IP_ROUTE_FWMARK
2023 rth->fl.fl4_fwmark= skb->nfmark;
2024#endif
2025 rth->fl.fl4_src = saddr;
2026 rth->rt_src = saddr;
2027#ifdef CONFIG_NET_CLS_ROUTE
2028 rth->u.dst.tclassid = itag;
2029#endif
2030 rth->rt_iif =
2031 rth->fl.iif = dev->ifindex;
2032 rth->u.dst.dev = &loopback_dev;
2033 dev_hold(rth->u.dst.dev);
2034 rth->idev = in_dev_get(rth->u.dst.dev);
2035 rth->rt_gateway = daddr;
2036 rth->rt_spec_dst= spec_dst;
2037 rth->u.dst.input= ip_local_deliver;
2038 rth->rt_flags = flags|RTCF_LOCAL;
2039 if (res.type == RTN_UNREACHABLE) {
2040 rth->u.dst.input= ip_error;
2041 rth->u.dst.error= -err;
2042 rth->rt_flags &= ~RTCF_LOCAL;
2043 }
2044 rth->rt_type = res.type;
2045 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
2046 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2047 goto done;
2048
2049no_route:
2050 RT_CACHE_STAT_INC(in_no_route);
2051 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2052 res.type = RTN_UNREACHABLE;
2053 goto local_input;
2054
2055 /*
2056 * Do not cache martian addresses: they should be logged (RFC1812)
2057 */
2058martian_destination:
2059 RT_CACHE_STAT_INC(in_martian_dst);
2060#ifdef CONFIG_IP_ROUTE_VERBOSE
2061 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2062 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2063 "%u.%u.%u.%u, dev %s\n",
2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2065#endif
2066
2067e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2070
2071e_inval:
2072 err = -EINVAL;
2073 goto done;
2074
2075e_nobufs:
2076 err = -ENOBUFS;
2077 goto done;
2078
2079martian_source:
2080 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2081 goto e_inval;
2082}
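/*
 * Editorial note (not part of the original source): every successful path
 * through ip_route_input_slow() ends in rt_intern_hash(), so the new rtable
 * is both attached to skb->dst and inserted into rt_hash_table, where the
 * fast path in ip_route_input() below will find it next time. Martian
 * sources are reported via ip_handle_martian_source() and then rejected
 * with -EINVAL.
 */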
2083
2084int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
2085 u8 tos, struct net_device *dev)
2086{
2087 struct rtable * rth;
2088 unsigned hash;
2089 int iif = dev->ifindex;
2090
2091 tos &= IPTOS_RT_MASK;
2092 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
2093
2094 rcu_read_lock();
2095 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2096 rth = rcu_dereference(rth->u.rt_next)) {
2097 if (rth->fl.fl4_dst == daddr &&
2098 rth->fl.fl4_src == saddr &&
2099 rth->fl.iif == iif &&
2100 rth->fl.oif == 0 &&
2101#ifdef CONFIG_IP_ROUTE_FWMARK
2102 rth->fl.fl4_fwmark == skb->nfmark &&
2103#endif
2104 rth->fl.fl4_tos == tos) {
2105 rth->u.dst.lastuse = jiffies;
2106 dst_hold(&rth->u.dst);
2107 rth->u.dst.__use++;
2108 RT_CACHE_STAT_INC(in_hit);
2109 rcu_read_unlock();
2110 skb->dst = (struct dst_entry*)rth;
2111 return 0;
2112 }
2113 RT_CACHE_STAT_INC(in_hlist_search);
2114 }
2115 rcu_read_unlock();
2116
2117	/* Multicast recognition logic has been moved from the route cache to here.
2118	   The problem was that too many Ethernet cards have broken/missing
2119	   hardware multicast filters :-( As a result, a host on a multicast
2120	   network acquires a lot of useless route cache entries, e.g. from
2121	   SDR messages from all over the world. Now we try to get rid of them.
2122	   Really, provided the software IP multicast filter is organized
2123	   reasonably (at least, hashed), this does not cause a slowdown
2124	   compared with route cache reject entries.
2125	   Note that multicast routers are not affected, because a
2126	   route cache entry is created for them eventually.
2127	 */
2128 if (MULTICAST(daddr)) {
2129 struct in_device *in_dev;
2130
2131 rcu_read_lock();
2132 if ((in_dev = __in_dev_get(dev)) != NULL) {
2133 int our = ip_check_mc(in_dev, daddr, saddr,
2134 skb->nh.iph->protocol);
2135 if (our
2136#ifdef CONFIG_IP_MROUTE
2137 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2138#endif
2139 ) {
2140 rcu_read_unlock();
2141 return ip_route_input_mc(skb, daddr, saddr,
2142 tos, dev, our);
2143 }
2144 }
2145 rcu_read_unlock();
2146 return -EINVAL;
2147 }
2148 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2149}
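/*
 * Illustrative sketch (not part of the original file): roughly how the
 * receive path hands a packet to ip_route_input(); the local names below
 * (iph, err) are hypothetical, the calls are those used by the IPv4 input
 * path in this tree.
 *
 *	struct iphdr *iph = skb->nh.iph;
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 *				 skb->dev);
 *	if (err)
 *		kfree_skb(skb);		// no route or martian: drop it
 *	else
 *		dst_input(skb);		// skb->dst->input() delivers or forwards
 */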
2150
2151static inline int __mkroute_output(struct rtable **result,
2152 struct fib_result* res,
2153 const struct flowi *fl,
2154 const struct flowi *oldflp,
2155 struct net_device *dev_out,
2156 unsigned flags)
2157{
2158 struct rtable *rth;
2159 struct in_device *in_dev;
2160 u32 tos = RT_FL_TOS(oldflp);
2161 int err = 0;
2162
2163 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2164 return -EINVAL;
2165
2166 if (fl->fl4_dst == 0xFFFFFFFF)
2167 res->type = RTN_BROADCAST;
2168 else if (MULTICAST(fl->fl4_dst))
2169 res->type = RTN_MULTICAST;
2170 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2171 return -EINVAL;
2172
2173 if (dev_out->flags & IFF_LOOPBACK)
2174 flags |= RTCF_LOCAL;
2175
2176 /* get work reference to inet device */
2177 in_dev = in_dev_get(dev_out);
2178 if (!in_dev)
2179 return -EINVAL;
2180
2181 if (res->type == RTN_BROADCAST) {
2182 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2183 if (res->fi) {
2184 fib_info_put(res->fi);
2185 res->fi = NULL;
2186 }
2187 } else if (res->type == RTN_MULTICAST) {
2188 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2189 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2190 oldflp->proto))
2191 flags &= ~RTCF_LOCAL;
2192		/* If a multicast route does not exist, use
2193		   the default one, but do not use a gateway in this case.
2194		   Yes, it is a hack.
2195		 */
2196 if (res->fi && res->prefixlen < 4) {
2197 fib_info_put(res->fi);
2198 res->fi = NULL;
2199 }
2200 }
2201
2202
2203 rth = dst_alloc(&ipv4_dst_ops);
2204 if (!rth) {
2205 err = -ENOBUFS;
2206 goto cleanup;
2207 }
2208
2209 rth->u.dst.flags= DST_HOST;
2210#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2211 if (res->fi) {
2212 rth->rt_multipath_alg = res->fi->fib_mp_alg;
2213 if (res->fi->fib_nhs > 1)
2214 rth->u.dst.flags |= DST_BALANCED;
2215 }
2216#endif
2217 if (in_dev->cnf.no_xfrm)
2218 rth->u.dst.flags |= DST_NOXFRM;
2219 if (in_dev->cnf.no_policy)
2220 rth->u.dst.flags |= DST_NOPOLICY;
2221
2222 rth->fl.fl4_dst = oldflp->fl4_dst;
2223 rth->fl.fl4_tos = tos;
2224 rth->fl.fl4_src = oldflp->fl4_src;
2225 rth->fl.oif = oldflp->oif;
2226#ifdef CONFIG_IP_ROUTE_FWMARK
2227 rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
2228#endif
2229 rth->rt_dst = fl->fl4_dst;
2230 rth->rt_src = fl->fl4_src;
2231 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2232	/* get references to the devices that are to be held by the routing
2233	   cache entry */
2234 rth->u.dst.dev = dev_out;
2235 dev_hold(dev_out);
2236 rth->idev = in_dev_get(dev_out);
2237 rth->rt_gateway = fl->fl4_dst;
2238 rth->rt_spec_dst= fl->fl4_src;
2239
2240 rth->u.dst.output=ip_output;
2241
2242 RT_CACHE_STAT_INC(out_slow_tot);
2243
2244 if (flags & RTCF_LOCAL) {
2245 rth->u.dst.input = ip_local_deliver;
2246 rth->rt_spec_dst = fl->fl4_dst;
2247 }
2248 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2249 rth->rt_spec_dst = fl->fl4_src;
2250 if (flags & RTCF_LOCAL &&
2251 !(dev_out->flags & IFF_LOOPBACK)) {
2252 rth->u.dst.output = ip_mc_output;
2253 RT_CACHE_STAT_INC(out_slow_mc);
2254 }
2255#ifdef CONFIG_IP_MROUTE
2256 if (res->type == RTN_MULTICAST) {
2257 if (IN_DEV_MFORWARD(in_dev) &&
2258 !LOCAL_MCAST(oldflp->fl4_dst)) {
2259 rth->u.dst.input = ip_mr_input;
2260 rth->u.dst.output = ip_mc_output;
2261 }
2262 }
2263#endif
2264 }
2265
2266 rt_set_nexthop(rth, res, 0);
2267
2268 rth->rt_flags = flags;
2269
2270 *result = rth;
2271 cleanup:
2272 /* release work reference to inet device */
2273 in_dev_put(in_dev);
2274
2275 return err;
2276}
2277
2278static inline int ip_mkroute_output_def(struct rtable **rp,
2279 struct fib_result* res,
2280 const struct flowi *fl,
2281 const struct flowi *oldflp,
2282 struct net_device *dev_out,
2283 unsigned flags)
2284{
2285	struct rtable *rth = NULL;
2286	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2287 unsigned hash;
2288 if (err == 0) {
2289 u32 tos = RT_FL_TOS(oldflp);
2290
2291 atomic_set(&rth->u.dst.__refcnt, 1);
2292
2293 hash = rt_hash_code(oldflp->fl4_dst,
2294 oldflp->fl4_src ^ (oldflp->oif << 5), tos);
2295 err = rt_intern_hash(hash, rth, rp);
2296 }
2297
2298 return err;
2299}
2300
2301static inline int ip_mkroute_output(struct rtable** rp,
2302 struct fib_result* res,
2303 const struct flowi *fl,
2304 const struct flowi *oldflp,
2305 struct net_device *dev_out,
2306 unsigned flags)
2307{
2308#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2309 u32 tos = RT_FL_TOS(oldflp);
2310 unsigned char hop;
2311 unsigned hash;
2312 int err = -EINVAL;
2313	struct rtable *rth = NULL;
2314
2315 if (res->fi && res->fi->fib_nhs > 1) {
2316 unsigned char hopcount = res->fi->fib_nhs;
2317
2318 for (hop = 0; hop < hopcount; hop++) {
2319 struct net_device *dev2nexthop;
2320
2321 res->nh_sel = hop;
2322
2323 /* hold a work reference to the output device */
2324 dev2nexthop = FIB_RES_DEV(*res);
2325 dev_hold(dev2nexthop);
2326
2327 err = __mkroute_output(&rth, res, fl, oldflp,
2328 dev2nexthop, flags);
2329
2330 if (err != 0)
2331 goto cleanup;
2332
2333 hash = rt_hash_code(oldflp->fl4_dst,
2334 oldflp->fl4_src ^
2335 (oldflp->oif << 5), tos);
2336 err = rt_intern_hash(hash, rth, rp);
2337
2338 /* forward hop information to multipath impl. */
2339 multipath_set_nhinfo(rth,
2340 FIB_RES_NETWORK(*res),
2341 FIB_RES_NETMASK(*res),
2342 res->prefixlen,
2343 &FIB_RES_NH(*res));
2344 cleanup:
2345 /* release work reference to output device */
2346 dev_put(dev2nexthop);
2347
2348 if (err != 0)
2349 return err;
2350 }
2351 atomic_set(&(*rp)->u.dst.__refcnt, 1);
2352 return err;
2353 } else {
2354 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
2355 flags);
2356 }
2357#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
2358 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
2359#endif
2360}
2361
2362/*
2363 * Major route resolver routine.
2364 */
2365
2366static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2367{
2368 u32 tos = RT_FL_TOS(oldflp);
2369 struct flowi fl = { .nl_u = { .ip4_u =
2370 { .daddr = oldflp->fl4_dst,
2371 .saddr = oldflp->fl4_src,
2372 .tos = tos & IPTOS_RT_MASK,
2373 .scope = ((tos & RTO_ONLINK) ?
2374 RT_SCOPE_LINK :
2375 RT_SCOPE_UNIVERSE),
2376#ifdef CONFIG_IP_ROUTE_FWMARK
2377 .fwmark = oldflp->fl4_fwmark
2378#endif
2379 } },
2380 .iif = loopback_dev.ifindex,
2381 .oif = oldflp->oif };
2382 struct fib_result res;
2383 unsigned flags = 0;
2384 struct net_device *dev_out = NULL;
2385 int free_res = 0;
2386 int err;
2387
2388
2389 res.fi = NULL;
2390#ifdef CONFIG_IP_MULTIPLE_TABLES
2391 res.r = NULL;
2392#endif
2393
2394 if (oldflp->fl4_src) {
2395 err = -EINVAL;
2396 if (MULTICAST(oldflp->fl4_src) ||
2397 BADCLASS(oldflp->fl4_src) ||
2398 ZERONET(oldflp->fl4_src))
2399 goto out;
2400
2401 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2402 dev_out = ip_dev_find(oldflp->fl4_src);
2403 if (dev_out == NULL)
2404 goto out;
2405
2406		/* I removed the check for oif == dev_out->oif here.
2407		   It was wrong for two reasons:
2408		   1. ip_dev_find(saddr) can return the wrong iface if saddr is
2409		      assigned to multiple interfaces.
2410		   2. Moreover, we are allowed to send packets with the saddr
2411		      of another iface. --ANK
2412		*/
2413
2414 if (oldflp->oif == 0
2415 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
2416			/* Special hack: the user can direct multicasts
2417			   and limited broadcast via the necessary interface
2418			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2419			   This hack is not just for fun, it allows
2420			   vic, vat and friends to work.
2421			   They bind the socket to loopback, set the ttl to zero
2422			   and expect that it will work.
2423			   From the viewpoint of the routing cache they are broken,
2424			   because we are not allowed to build a multicast path
2425			   with a loopback source addr (the routing cache
2426			   cannot know that the ttl is zero, so the packet
2427			   will not leave this host and the route is valid).
2428			   Luckily, this hack is a good workaround.
2429			*/
2430
2431 fl.oif = dev_out->ifindex;
2432 goto make_route;
2433 }
2434 if (dev_out)
2435 dev_put(dev_out);
2436 dev_out = NULL;
2437 }
2438
2439
2440 if (oldflp->oif) {
2441 dev_out = dev_get_by_index(oldflp->oif);
2442 err = -ENODEV;
2443 if (dev_out == NULL)
2444 goto out;
2445 if (__in_dev_get(dev_out) == NULL) {
2446 dev_put(dev_out);
2447 goto out; /* Wrong error code */
2448 }
2449
2450 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
2451 if (!fl.fl4_src)
2452 fl.fl4_src = inet_select_addr(dev_out, 0,
2453 RT_SCOPE_LINK);
2454 goto make_route;
2455 }
2456 if (!fl.fl4_src) {
2457 if (MULTICAST(oldflp->fl4_dst))
2458 fl.fl4_src = inet_select_addr(dev_out, 0,
2459 fl.fl4_scope);
2460 else if (!oldflp->fl4_dst)
2461 fl.fl4_src = inet_select_addr(dev_out, 0,
2462 RT_SCOPE_HOST);
2463 }
2464 }
2465
2466 if (!fl.fl4_dst) {
2467 fl.fl4_dst = fl.fl4_src;
2468 if (!fl.fl4_dst)
2469 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2470 if (dev_out)
2471 dev_put(dev_out);
2472 dev_out = &loopback_dev;
2473 dev_hold(dev_out);
2474 fl.oif = loopback_dev.ifindex;
2475 res.type = RTN_LOCAL;
2476 flags |= RTCF_LOCAL;
2477 goto make_route;
2478 }
2479
2480 if (fib_lookup(&fl, &res)) {
2481 res.fi = NULL;
2482 if (oldflp->oif) {
2483			/* Apparently, the routing tables are wrong. Assume
2484			   that the destination is on-link.
2485
2486			   WHY? DW.
2487			   Because we are allowed to send to an iface
2488			   even if it has NO routes and NO assigned
2489			   addresses. When oif is specified, the routing
2490			   tables are looked up with only one purpose:
2491			   to catch whether the destination is gatewayed rather than
2492			   direct. Moreover, if MSG_DONTROUTE is set,
2493			   we send the packet, ignoring both the routing tables
2494			   and the ifaddr state. --ANK
2495
2496
2497			   We could do this even when oif is unknown
2498			   (as IPv6 likely does), but we do not.
2499			*/
2500
2501 if (fl.fl4_src == 0)
2502 fl.fl4_src = inet_select_addr(dev_out, 0,
2503 RT_SCOPE_LINK);
2504 res.type = RTN_UNICAST;
2505 goto make_route;
2506 }
2507 if (dev_out)
2508 dev_put(dev_out);
2509 err = -ENETUNREACH;
2510 goto out;
2511 }
2512 free_res = 1;
2513
2514 if (res.type == RTN_LOCAL) {
2515 if (!fl.fl4_src)
2516 fl.fl4_src = fl.fl4_dst;
2517 if (dev_out)
2518 dev_put(dev_out);
2519 dev_out = &loopback_dev;
2520 dev_hold(dev_out);
2521 fl.oif = dev_out->ifindex;
2522 if (res.fi)
2523 fib_info_put(res.fi);
2524 res.fi = NULL;
2525 flags |= RTCF_LOCAL;
2526 goto make_route;
2527 }
2528
2529#ifdef CONFIG_IP_ROUTE_MULTIPATH
2530 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2531 fib_select_multipath(&fl, &res);
2532 else
2533#endif
2534 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2535 fib_select_default(&fl, &res);
2536
2537 if (!fl.fl4_src)
2538 fl.fl4_src = FIB_RES_PREFSRC(res);
2539
2540 if (dev_out)
2541 dev_put(dev_out);
2542 dev_out = FIB_RES_DEV(res);
2543 dev_hold(dev_out);
2544 fl.oif = dev_out->ifindex;
2545
2546
2547make_route:
2548 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2549
2550
2551 if (free_res)
2552 fib_res_put(&res);
2553 if (dev_out)
2554 dev_put(dev_out);
2555out: return err;
2556}
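/*
 * Editorial note (not part of the original source): the resolver above works
 * roughly in this order: validate an explicit source address, pin the output
 * device when oif is given, short-circuit local/loopback destinations, then
 * fib_lookup(); on success it selects a nexthop (fib_select_multipath() or
 * fib_select_default()), fills in the preferred source address, and finally
 * builds and caches the route in ip_mkroute_output() at make_route.
 */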
2557
2558int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2559{
2560 unsigned hash;
2561 struct rtable *rth;
2562
2563 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
2564
2565 rcu_read_lock_bh();
2566 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2567 rth = rcu_dereference(rth->u.rt_next)) {
2568 if (rth->fl.fl4_dst == flp->fl4_dst &&
2569 rth->fl.fl4_src == flp->fl4_src &&
2570 rth->fl.iif == 0 &&
2571 rth->fl.oif == flp->oif &&
2572#ifdef CONFIG_IP_ROUTE_FWMARK
2573 rth->fl.fl4_fwmark == flp->fl4_fwmark &&
2574#endif
2575 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2576 (IPTOS_RT_MASK | RTO_ONLINK))) {
2577
2578 /* check for multipath routes and choose one if
2579 * necessary
2580 */
2581 if (multipath_select_route(flp, rth, rp)) {
2582 dst_hold(&(*rp)->u.dst);
2583 RT_CACHE_STAT_INC(out_hit);
2584 rcu_read_unlock_bh();
2585 return 0;
2586 }
2587
2588 rth->u.dst.lastuse = jiffies;
2589 dst_hold(&rth->u.dst);
2590 rth->u.dst.__use++;
2591 RT_CACHE_STAT_INC(out_hit);
2592 rcu_read_unlock_bh();
2593 *rp = rth;
2594 return 0;
2595 }
2596 RT_CACHE_STAT_INC(out_hlist_search);
2597 }
2598 rcu_read_unlock_bh();
2599
2600 return ip_route_output_slow(rp, flp);
2601}
2602
2603int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2604{
2605 int err;
2606
2607 if ((err = __ip_route_output_key(rp, flp)) != 0)
2608 return err;
2609
2610 if (flp->proto) {
2611 if (!flp->fl4_src)
2612 flp->fl4_src = (*rp)->rt_src;
2613 if (!flp->fl4_dst)
2614 flp->fl4_dst = (*rp)->rt_dst;
2615 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2616 }
2617
2618 return 0;
2619}
2620
2621int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2622{
2623 return ip_route_output_flow(rp, flp, NULL, 0);
2624}
2625
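/*
 * Illustrative sketch (not part of the original file): how an in-kernel
 * caller typically resolves an output route through the wrappers above.
 * The variable names (dst_ip, tos, rt) are hypothetical.
 *
 *	u32 dst_ip = ...;			// network byte order
 *	struct rtable *rt;
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = dst_ip,
 *						 .saddr = 0,
 *						 .tos = RT_TOS(tos) } } };
 *
 *	if (ip_route_output_key(&rt, &fl) == 0) {
 *		// rt->u.dst is the cached route; drop the reference when done
 *		ip_rt_put(rt);
 *	}
 */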
2626static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2627			int nowait, unsigned int flags)
2628{
2629 struct rtable *rt = (struct rtable*)skb->dst;
2630 struct rtmsg *r;
2631 struct nlmsghdr *nlh;
2632 unsigned char *b = skb->tail;
2633 struct rta_cacheinfo ci;
2634#ifdef CONFIG_IP_MROUTE
2635 struct rtattr *eptr;
2636#endif
2637	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2638	r = NLMSG_DATA(nlh);
2639	r->rtm_family	 = AF_INET;
2640 r->rtm_dst_len = 32;
2641 r->rtm_src_len = 0;
2642 r->rtm_tos = rt->fl.fl4_tos;
2643 r->rtm_table = RT_TABLE_MAIN;
2644 r->rtm_type = rt->rt_type;
2645 r->rtm_scope = RT_SCOPE_UNIVERSE;
2646 r->rtm_protocol = RTPROT_UNSPEC;
2647 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2648 if (rt->rt_flags & RTCF_NOTIFY)
2649 r->rtm_flags |= RTM_F_NOTIFY;
2650 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
2651 if (rt->fl.fl4_src) {
2652 r->rtm_src_len = 32;
2653 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
2654 }
2655 if (rt->u.dst.dev)
2656 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
2657#ifdef CONFIG_NET_CLS_ROUTE
2658 if (rt->u.dst.tclassid)
2659 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
2660#endif
2661#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
2662 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
2663 __u32 alg = rt->rt_multipath_alg;
2664
2665 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
2666 }
2667#endif
2668 if (rt->fl.iif)
2669 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
2670 else if (rt->rt_src != rt->fl.fl4_src)
2671 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
2672 if (rt->rt_dst != rt->rt_gateway)
2673 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
2674 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2675 goto rtattr_failure;
2676 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2677 ci.rta_used = rt->u.dst.__use;
2678 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2679 if (rt->u.dst.expires)
2680 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
2681 else
2682 ci.rta_expires = 0;
2683 ci.rta_error = rt->u.dst.error;
2684 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0;
2685 if (rt->peer) {
2686 ci.rta_id = rt->peer->ip_id_count;
2687 if (rt->peer->tcp_ts_stamp) {
2688 ci.rta_ts = rt->peer->tcp_ts;
2689 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
2690 }
2691 }
2692#ifdef CONFIG_IP_MROUTE
2693 eptr = (struct rtattr*)skb->tail;
2694#endif
2695 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2696 if (rt->fl.iif) {
2697#ifdef CONFIG_IP_MROUTE
2698 u32 dst = rt->rt_dst;
2699
2700 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2701 ipv4_devconf.mc_forwarding) {
2702 int err = ipmr_get_route(skb, r, nowait);
2703 if (err <= 0) {
2704 if (!nowait) {
2705 if (err == 0)
2706 return 0;
2707 goto nlmsg_failure;
2708 } else {
2709 if (err == -EMSGSIZE)
2710 goto nlmsg_failure;
2711 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
2712 }
2713 }
2714 } else
2715#endif
2716 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
2717 }
2718
2719 nlh->nlmsg_len = skb->tail - b;
2720 return skb->len;
2721
2722nlmsg_failure:
2723rtattr_failure:
2724 skb_trim(skb, b - skb->data);
2725 return -1;
2726}
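/*
 * Editorial sketch (derived from rt_fill_info() above, not original text):
 * the RTM_NEWROUTE message it builds is laid out roughly as
 *
 *	struct nlmsghdr			size, type, flags, seq, pid
 *	struct rtmsg			family, dst/src len, tos, type, flags
 *	RTA_DST [, RTA_SRC]		4-byte IPv4 addresses
 *	[RTA_OIF] [RTA_GATEWAY] [RTA_PREFSRC] [RTA_FLOW]
 *	metrics, RTA_CACHEINFO		struct rta_cacheinfo statistics
 *	[RTA_IIF]			input routes only
 */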
2727
2728int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2729{
2730 struct rtattr **rta = arg;
2731 struct rtmsg *rtm = NLMSG_DATA(nlh);
2732 struct rtable *rt = NULL;
2733 u32 dst = 0;
2734 u32 src = 0;
2735 int iif = 0;
2736 int err = -ENOBUFS;
2737 struct sk_buff *skb;
2738
2739 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2740 if (!skb)
2741 goto out;
2742
2743	/* Reserve room for dummy headers; this skb can pass
2744	   through a good chunk of the routing engine.
2745	 */
2746 skb->mac.raw = skb->data;
2747 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2748
2749 if (rta[RTA_SRC - 1])
2750 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
2751 if (rta[RTA_DST - 1])
2752 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
2753 if (rta[RTA_IIF - 1])
2754 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
2755
2756 if (iif) {
2757 struct net_device *dev = __dev_get_by_index(iif);
2758 err = -ENODEV;
2759 if (!dev)
2760 goto out_free;
2761 skb->protocol = htons(ETH_P_IP);
2762 skb->dev = dev;
2763 local_bh_disable();
2764 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2765 local_bh_enable();
2766 rt = (struct rtable*)skb->dst;
2767 if (!err && rt->u.dst.error)
2768 err = -rt->u.dst.error;
2769 } else {
2770 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
2771 .saddr = src,
2772 .tos = rtm->rtm_tos } } };
2773 int oif = 0;
2774 if (rta[RTA_OIF - 1])
2775 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
2776 fl.oif = oif;
2777 err = ip_route_output_key(&rt, &fl);
2778 }
2779 if (err)
2780 goto out_free;
2781
2782 skb->dst = &rt->u.dst;
2783 if (rtm->rtm_flags & RTM_F_NOTIFY)
2784 rt->rt_flags |= RTCF_NOTIFY;
2785
2786 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2787
2788 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2789				RTM_NEWROUTE, 0, 0);
2790	if (!err)
2791 goto out_free;
2792 if (err < 0) {
2793 err = -EMSGSIZE;
2794 goto out_free;
2795 }
2796
2797 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2798 if (err > 0)
2799 err = 0;
2800out: return err;
2801
2802out_free:
2803 kfree_skb(skb);
2804 goto out;
2805}
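/*
 * Illustrative sketch (not part of the original file): the kind of netlink
 * request that lands in inet_rtm_getroute() above, as sent by "ip route get"
 * from userspace. Types come from <linux/rtnetlink.h>; the exact buffer
 * layout below is a simplified assumption.
 *
 *	struct {
 *		struct nlmsghdr	nlh;
 *		struct rtmsg	rtm;
 *		char		attrs[64];	// room for an RTA_DST attribute
 *	} req;
 *
 *	req.nlh.nlmsg_type  = RTM_GETROUTE;
 *	req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *	req.rtm.rtm_family  = AF_INET;
 *	// append RTA_DST (4-byte destination), send over a NETLINK_ROUTE
 *	// socket; the reply is the RTM_NEWROUTE message built by
 *	// rt_fill_info().
 */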
2806
2807int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2808{
2809 struct rtable *rt;
2810 int h, s_h;
2811 int idx, s_idx;
2812
2813 s_h = cb->args[0];
2814 s_idx = idx = cb->args[1];
2815 for (h = 0; h <= rt_hash_mask; h++) {
2816 if (h < s_h) continue;
2817 if (h > s_h)
2818 s_idx = 0;
2819 rcu_read_lock_bh();
2820 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2821 rt = rcu_dereference(rt->u.rt_next), idx++) {
2822 if (idx < s_idx)
2823 continue;
2824 skb->dst = dst_clone(&rt->u.dst);
2825 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2826					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2827					 1, NLM_F_MULTI) <= 0) {
2828				dst_release(xchg(&skb->dst, NULL));
2829 rcu_read_unlock_bh();
2830 goto done;
2831 }
2832 dst_release(xchg(&skb->dst, NULL));
2833 }
2834 rcu_read_unlock_bh();
2835 }
2836
2837done:
2838 cb->args[0] = h;
2839 cb->args[1] = idx;
2840 return skb->len;
2841}
2842
2843void ip_rt_multicast_event(struct in_device *in_dev)
2844{
2845 rt_cache_flush(0);
2846}
2847
2848#ifdef CONFIG_SYSCTL
2849static int flush_delay;
2850
2851static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2852 struct file *filp, void __user *buffer,
2853 size_t *lenp, loff_t *ppos)
2854{
2855 if (write) {
2856 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2857 rt_cache_flush(flush_delay);
2858 return 0;
2859 }
2860
2861 return -EINVAL;
2862}
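/*
 * Usage note (illustrative, not part of the original source): writing a
 * value to /proc/sys/net/ipv4/route/flush invokes rt_cache_flush() with
 * that value as the flush delay; e.g. "echo 0 > /proc/sys/net/ipv4/route/flush"
 * flushes the route cache immediately. The handler rejects reads with
 * -EINVAL, and the table entry below is mode 0200, i.e. write-only.
 */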
2863
2864static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2865 int __user *name,
2866 int nlen,
2867 void __user *oldval,
2868 size_t __user *oldlenp,
2869 void __user *newval,
2870 size_t newlen,
2871 void **context)
2872{
2873 int delay;
2874 if (newlen != sizeof(int))
2875 return -EINVAL;
2876 if (get_user(delay, (int __user *)newval))
2877 return -EFAULT;
2878 rt_cache_flush(delay);
2879 return 0;
2880}
2881
2882ctl_table ipv4_route_table[] = {
2883 {
2884 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2885 .procname = "flush",
2886 .data = &flush_delay,
2887 .maxlen = sizeof(int),
2888		.mode		= 0200,
2889		.proc_handler	= &ipv4_sysctl_rtcache_flush,
2890 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2891 },
2892 {
2893 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2894 .procname = "min_delay",
2895 .data = &ip_rt_min_delay,
2896 .maxlen = sizeof(int),
2897 .mode = 0644,
2898 .proc_handler = &proc_dointvec_jiffies,
2899 .strategy = &sysctl_jiffies,
2900 },
2901 {
2902 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2903 .procname = "max_delay",
2904 .data = &ip_rt_max_delay,
2905 .maxlen = sizeof(int),
2906 .mode = 0644,
2907 .proc_handler = &proc_dointvec_jiffies,
2908 .strategy = &sysctl_jiffies,
2909 },
2910 {
2911 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2912 .procname = "gc_thresh",
2913 .data = &ipv4_dst_ops.gc_thresh,
2914 .maxlen = sizeof(int),
2915 .mode = 0644,
2916 .proc_handler = &proc_dointvec,
2917 },
2918 {
2919 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2920 .procname = "max_size",
2921 .data = &ip_rt_max_size,
2922 .maxlen = sizeof(int),
2923 .mode = 0644,
2924 .proc_handler = &proc_dointvec,
2925 },
2926 {
2927 /* Deprecated. Use gc_min_interval_ms */
2928
2929 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2930 .procname = "gc_min_interval",
2931 .data = &ip_rt_gc_min_interval,
2932 .maxlen = sizeof(int),
2933 .mode = 0644,
2934 .proc_handler = &proc_dointvec_jiffies,
2935 .strategy = &sysctl_jiffies,
2936 },
2937 {
2938 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2939 .procname = "gc_min_interval_ms",
2940 .data = &ip_rt_gc_min_interval,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec_ms_jiffies,
2944 .strategy = &sysctl_ms_jiffies,
2945 },
2946 {
2947 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2948 .procname = "gc_timeout",
2949 .data = &ip_rt_gc_timeout,
2950 .maxlen = sizeof(int),
2951 .mode = 0644,
2952 .proc_handler = &proc_dointvec_jiffies,
2953 .strategy = &sysctl_jiffies,
2954 },
2955 {
2956 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2957 .procname = "gc_interval",
2958 .data = &ip_rt_gc_interval,
2959 .maxlen = sizeof(int),
2960 .mode = 0644,
2961 .proc_handler = &proc_dointvec_jiffies,
2962 .strategy = &sysctl_jiffies,
2963 },
2964 {
2965 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2966 .procname = "redirect_load",
2967 .data = &ip_rt_redirect_load,
2968 .maxlen = sizeof(int),
2969 .mode = 0644,
2970 .proc_handler = &proc_dointvec,
2971 },
2972 {
2973 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2974 .procname = "redirect_number",
2975 .data = &ip_rt_redirect_number,
2976 .maxlen = sizeof(int),
2977 .mode = 0644,
2978 .proc_handler = &proc_dointvec,
2979 },
2980 {
2981 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2982 .procname = "redirect_silence",
2983 .data = &ip_rt_redirect_silence,
2984 .maxlen = sizeof(int),
2985 .mode = 0644,
2986 .proc_handler = &proc_dointvec,
2987 },
2988 {
2989 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2990 .procname = "error_cost",
2991 .data = &ip_rt_error_cost,
2992 .maxlen = sizeof(int),
2993 .mode = 0644,
2994 .proc_handler = &proc_dointvec,
2995 },
2996 {
2997 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2998 .procname = "error_burst",
2999 .data = &ip_rt_error_burst,
3000 .maxlen = sizeof(int),
3001 .mode = 0644,
3002 .proc_handler = &proc_dointvec,
3003 },
3004 {
3005 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3006 .procname = "gc_elasticity",
3007 .data = &ip_rt_gc_elasticity,
3008 .maxlen = sizeof(int),
3009 .mode = 0644,
3010 .proc_handler = &proc_dointvec,
3011 },
3012 {
3013 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3014 .procname = "mtu_expires",
3015 .data = &ip_rt_mtu_expires,
3016 .maxlen = sizeof(int),
3017 .mode = 0644,
3018 .proc_handler = &proc_dointvec_jiffies,
3019 .strategy = &sysctl_jiffies,
3020 },
3021 {
3022 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3023 .procname = "min_pmtu",
3024 .data = &ip_rt_min_pmtu,
3025 .maxlen = sizeof(int),
3026 .mode = 0644,
3027 .proc_handler = &proc_dointvec,
3028 },
3029 {
3030 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3031 .procname = "min_adv_mss",
3032 .data = &ip_rt_min_advmss,
3033 .maxlen = sizeof(int),
3034 .mode = 0644,
3035 .proc_handler = &proc_dointvec,
3036 },
3037 {
3038 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3039 .procname = "secret_interval",
3040 .data = &ip_rt_secret_interval,
3041 .maxlen = sizeof(int),
3042 .mode = 0644,
3043 .proc_handler = &proc_dointvec_jiffies,
3044 .strategy = &sysctl_jiffies,
3045 },
3046 { .ctl_name = 0 }
3047};
3048#endif
3049
3050#ifdef CONFIG_NET_CLS_ROUTE
3051struct ip_rt_acct *ip_rt_acct;
3052
3053/* This code sucks. But you should have seen it before! --RR */
3054
3055/* IP route accounting ptr for this logical cpu number. */
3056#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
3057
3058#ifdef CONFIG_PROC_FS
3059static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
3060 int length, int *eof, void *data)
3061{
3062 unsigned int i;
3063
3064 if ((offset & 3) || (length & 3))
3065 return -EIO;
3066
3067 if (offset >= sizeof(struct ip_rt_acct) * 256) {
3068 *eof = 1;
3069 return 0;
3070 }
3071
3072 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
3073 length = sizeof(struct ip_rt_acct) * 256 - offset;
3074 *eof = 1;
3075 }
3076
3077 offset /= sizeof(u32);
3078
3079 if (length > 0) {
3080 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
3081 u32 *dst = (u32 *) buffer;
3082
3083 /* Copy first cpu. */
3084 *start = buffer;
3085 memcpy(dst, src, length);
3086
3087 /* Add the other cpus in, one int at a time */
3088 for_each_cpu(i) {
3089 unsigned int j;
3090
3091 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
3092
3093 for (j = 0; j < length/4; j++)
3094 dst[j] += src[j];
3095 }
3096 }
3097 return length;
3098}
3099#endif /* CONFIG_PROC_FS */
3100#endif /* CONFIG_NET_CLS_ROUTE */
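/*
 * Usage note (illustrative, not part of the original source): with
 * CONFIG_NET_CLS_ROUTE the per-cpu counters above are exported through the
 * /proc/net/rt_acct file registered in ip_rt_init() below;
 * ip_rt_acct_read() copies the first cpu's 256-entry table (indexed by
 * tclassid, i.e. the routing realm) and then adds in the other cpus'
 * counters one word at a time.
 */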
3101
3102static __initdata unsigned long rhash_entries;
3103static int __init set_rhash_entries(char *str)
3104{
3105 if (!str)
3106 return 0;
3107 rhash_entries = simple_strtoul(str, &str, 0);
3108 return 1;
3109}
3110__setup("rhash_entries=", set_rhash_entries);
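/*
 * Usage note (illustrative, not part of the original source): the size of
 * the route cache hash table can be forced with the "rhash_entries=" boot
 * parameter, e.g. booting with rhash_entries=65536; otherwise
 * alloc_large_system_hash() in ip_rt_init() below sizes it from the amount
 * of physical memory.
 */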
3111
3112int __init ip_rt_init(void)
3113{
3114	int rc = 0;
3115
3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3117 (jiffies ^ (jiffies >> 7)));
3118
3119#ifdef CONFIG_NET_CLS_ROUTE
3120	{
3121	int order;
3122	for (order = 0;
3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3124 /* NOTHING */;
3125 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
3126 if (!ip_rt_acct)
3127 panic("IP: failed to allocate ip_rt_acct\n");
3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129	}
3130#endif
3131
3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
3133 sizeof(struct rtable),
3134 0, SLAB_HWCACHE_ALIGN,
3135 NULL, NULL);
3136
3137 if (!ipv4_dst_ops.kmem_cachep)
3138 panic("IP: failed to allocate ip_dst_cache\n");
3139
3140	rt_hash_table = (struct rt_hash_bucket *)
3141 alloc_large_system_hash("IP route cache",
3142 sizeof(struct rt_hash_bucket),
3143 rhash_entries,
3144 (num_physpages >= 128 * 1024) ?
3145 (27 - PAGE_SHIFT) :
3146 (29 - PAGE_SHIFT),
3147 HASH_HIGHMEM,
3148 &rt_hash_log,
3149 &rt_hash_mask,
3150 0);
3151	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3152	rt_hash_lock_init();
3153
3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3156
3157 rt_cache_stat = alloc_percpu(struct rt_cache_stat);
3158 if (!rt_cache_stat)
3159 return -ENOMEM;
3160
3161 devinet_init();
3162 ip_fib_init();
3163
3164 init_timer(&rt_flush_timer);
3165 rt_flush_timer.function = rt_run_flush;
3166 init_timer(&rt_periodic_timer);
3167 rt_periodic_timer.function = rt_check_expire;
3168 init_timer(&rt_secret_timer);
3169 rt_secret_timer.function = rt_secret_rebuild;
3170
3171	/* All the timers started at system startup tend
3172	   to synchronize. Perturb them a bit.
3173	 */
3174 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
3175 ip_rt_gc_interval;
3176 add_timer(&rt_periodic_timer);
3177
3178 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3179 ip_rt_secret_interval;
3180 add_timer(&rt_secret_timer);
3181
3182#ifdef CONFIG_PROC_FS
3183 {
3184 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3185 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3186 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3187 proc_net_stat))) {
3188 free_percpu(rt_cache_stat);
3189 return -ENOMEM;
3190 }
3191 rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3192 }
3193#ifdef CONFIG_NET_CLS_ROUTE
3194 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
3195#endif
3196#endif
3197#ifdef CONFIG_XFRM
3198 xfrm_init();
3199 xfrm4_init();
3200#endif
3201 return rc;
3202}
3203
3204EXPORT_SYMBOL(__ip_select_ident);
3205EXPORT_SYMBOL(ip_route_input);
3206EXPORT_SYMBOL(ip_route_output_key);