/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static int ip_rt_flush_expected;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		ip_local_out,
	.entry_size =		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

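/*
 * Illustrative sketch (not part of the original file, compiled out):
 * the lockless reader pattern this scheme implies.  Readers walk a
 * chain under rcu_read_lock_bh() and fetch pointers through
 * rcu_dereference(); writers serialize on the per-bucket lock and
 * publish new links with rcu_assign_pointer() (see rt_intern_hash()
 * below).  The helper name is hypothetical; it references symbols
 * defined later in this file, which is fine since it is never built.
 */
#if 0	/* example only */
static struct rtable *example_rt_lookup(unsigned hash, struct flowi *key)
{
	struct rtable *rth;

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (compare_keys(&rth->fl, key)) {
			dst_hold(&rth->u.dst);	/* atomic ref, no bucket lock */
			break;
		}
	}
	rcu_read_unlock_bh();
	return rth;
}
#endif
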
struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUs.
 * (on lockdep we have quite a big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif

static struct rt_hash_bucket	*rt_hash_table;
static unsigned			rt_hash_mask;
static unsigned int		rt_hash_log;
static unsigned int		rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		(__force u32)(__be32)(saddr) ^ ((idx) << 5))

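/*
 * Usage note (illustrative): input routes hash with the ingress ifindex
 * as @idx, e.g. rt_hash(daddr, saddr, dev->ifindex) in ip_route_input_mc()
 * below; output routes use the oif (see ipv4_negative_advice()); PMTU
 * lookups in ip_rt_frag_needed() pass 0.  The "(idx) << 5" spreads small
 * interface numbers above the low bits of saddr before jhash mixes them.
 */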
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif
#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
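
/*
 * Worked example (illustrative): an unreferenced output route
 * (fl.iif == 0) last used 100 jiffies ago and not rt_valuable() scores
 * (~100 & ~(3<<30)) | (1<<30): bit 31 clear, bit 30 set, and the low 30
 * bits shrink as idle time grows, so rt_intern_hash() below picks the
 * longest-idle, least interesting chain entry as its eviction candidate.
 */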

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * If a whole flush was scheduled, it is done.
 * Else, we call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	if (ip_rt_flush_expected) {
		ip_rt_flush_expected = 0;
		rt_do_flush(1);
	} else
		rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long process_context)
{
	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	rt_do_flush(process_context);
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolong timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(user_mode);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}

/*
 * We change rt_hash_rnd and ask the next rt_worker_func() invocation
 * to perform a flush in process context.
 */
static void rt_secret_rebuild(unsigned long dummy)
{
	get_random_bytes(&rt_hash_rnd, 4);
	ip_rt_flush_expected = 1;
	cancel_delayed_work(&expires_work);
	schedule_delayed_work(&expires_work, HZ/10);
	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire stays large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */

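/*
 * Worked example (illustrative, values assumed): with the default
 * ip_rt_gc_elasticity = 8 and a 1024-bucket table (rt_hash_log = 10),
 * the soft limit below is 8 << 10 = 8192 cached entries; at 9000
 * entries the initial goal is 9000 - 8192 = 808 expirations this pass.
 */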
static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (compare_keys(&rth->fl, &rt->fl)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache;
			   most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.dst.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.dst.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

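/*
 * Worked example (illustrative): with HZ = 1000, the defaults above give
 * ip_rt_redirect_load = HZ/50 = 20 jiffies, so successive redirects are
 * spaced 20 << rate_tokens jiffies apart (20, 40, 80, ...).  After
 * ip_rt_redirect_number = 9 of them we fall silent until
 * ip_rt_redirect_silence = 20 << 10 = 20480 jiffies (~20s) pass without
 * a redirect-worthy packet, at which point rate_tokens is reset below.
 */
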
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
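
/*
 * Example (illustrative): guess_mtu(1500) returns 1492, the next plateau
 * strictly below the old MTU; guess_mtu(576) returns 296; anything at or
 * below 128 falls through to the 68-byte minimum.
 */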

unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst  == daddr &&
			    rth->rt_src  == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(&rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;

	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
						       ip_rt_min_advmss);
	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr)) {
		if (!LOCAL_MCAST(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
				       dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->u.dst.dev = init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_flags = RTCF_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex);
	return rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}

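/*
 * Count and, when route verbosity is on, rate-limit-log a packet whose
 * source address failed validation, dumping the link-layer header as
 * the only reliable hint to where it came from (RFC 1812).
 */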
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC 1812 recommendation: if the source is martian,
		 * the only hint we have is the MAC header.
		 */
		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
			"%u.%u.%u.%u, on dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

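/*
 * Construct a cache entry for a forwarded packet: validate the source
 * against the FIB, decide whether an ICMP redirect is warranted, and
 * wire the entry up to ip_forward()/ip_output().
 */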
static inline int __mkroute_input(struct sk_buff *skb,
				  struct fib_result *res,
				  struct in_device *in_dev,
				  __be32 daddr, __be32 saddr, u32 tos,
				  struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input"
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);
		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP).  Do not create a route if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 */
		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
	rth->rt_iif =
	rth->fl.iif = in_dev->dev->ifindex;
	rth->u.dst.dev = out_dev->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

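/*
 * Pick a nexthop (multipath-aware), build the cache entry and insert
 * it into the route hash, attaching it to skb->dst.
 */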
static inline int ip_mkroute_input(struct sk_buff *skb,
				   struct fib_result *res,
				   const struct flowi *fl,
				   struct in_device *in_dev,
				   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif);
	return rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
}

/*
 * NOTE. We drop all packets that have a local source address, because
 * every properly looped-back packet must already have the correct
 * destination attached by the output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 */

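/*
 * Slow path for input route resolution: weed out martians, look the
 * destination up in the FIB, and build a local, broadcast or
 * forwarding cache entry accordingly.
 */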
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned hash;
	__be32 spec_dst;
	int err = -EINVAL;
	int free_res = 0;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ZERONET(saddr))
		goto martian_source;

	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(&fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     init_net.loopback_dev->ifindex,
					     dev, &spec_dst, &itag);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ZERONET(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif =
	rth->fl.iif = dev->ifindex;
	rth->u.dst.dev = init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags = flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	rth->rt_type = res.type;
	hash = rt_hash(daddr, saddr, fl.iif);
	err = rt_intern_hash(hash, rth, (struct rtable **)&skb->dst);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
			"%u.%u.%u.%u, dev %s\n",
			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

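/*
 * Input route fast path: look the flow up in the cache under RCU and
 * fall back to ip_route_input_slow() on a miss.  Multicast
 * destinations are recognized here rather than cached; see the comment
 * in the body.
 */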
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif);

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == daddr &&
		    rth->fl.fl4_src == saddr &&
		    rth->fl.iif == iif &&
		    rth->fl.oif == 0 &&
		    rth->fl.mark == skb->mark &&
		    rth->fl.fl4_tos == tos) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb->dst = (struct dst_entry *)rth;
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

	/* Multicast recognition logic is moved from the route cache to
	   here.  The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-( As a result, a
	   host on a multicast network acquires a lot of useless route
	   cache entries, like SDR messages from all over the world.
	   Now we try to get rid of them.  Really, provided the software
	   IP multicast filter is organized reasonably (at least,
	   hashed), it does not result in a slowdown compared with route
	   cache reject entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is created eventually.
	 */
	if (MULTICAST(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

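/*
 * Build an output cache entry from a FIB result: classify the
 * destination (broadcast/multicast/unicast), allocate the rtable and
 * wire it to ip_output() (or ip_mc_output()/ip_mr_input() for
 * multicast).
 */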
static inline int __mkroute_output(struct rtable **result,
				   struct fib_result *res,
				   const struct flowi *fl,
				   const struct flowi *oldflp,
				   struct net_device *dev_out,
				   unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (MULTICAST(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get a working reference to the inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST|RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the default
		   one, but do not gateway in this case.
		   Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst = oldflp->fl4_dst;
	rth->fl.fl4_tos = tos;
	rth->fl.fl4_src = oldflp->fl4_src;
	rth->fl.oif = oldflp->oif;
	rth->fl.mark = oldflp->mark;
	rth->rt_dst = fl->fl4_dst;
	rth->rt_src = fl->fl4_src;
	rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
	/* get references to the devices that are to be held by the
	   routing cache entry */
	rth->u.dst.dev = dev_out;
	dev_hold(dev_out);
	rth->idev = in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !LOCAL_MCAST(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
cleanup:
	/* release the working reference to the inet device */
	in_dev_put(in_dev);

	return err;
}

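/*
 * Build the output cache entry and insert it into the route hash,
 * keyed on the caller's original flow.
 */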
static inline int ip_mkroute_output(struct rtable **rp,
				    struct fib_result *res,
				    const struct flowi *fl,
				    const struct flowi *oldflp,
				    struct net_device *dev_out,
				    unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
		err = rt_intern_hash(hash, rth, rp);
	}

	return err;
}

/*
 * Major route resolver routine.
 */

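/*
 * In outline: validate any caller-supplied source address, resolve the
 * output device from oif (or from saddr when only that is given),
 * special-case multicast, limited broadcast and local destinations,
 * fall back to fib_lookup() for everything else, then build and hash
 * the cache entry via ip_mkroute_output().
 */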
static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
{
	u32 tos = RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = init_net.loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;

	res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r = NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (MULTICAST(oldflp->fl4_src) ||
		    BADCLASS(oldflp->fl4_src) ||
		    ZERONET(oldflp->fl4_src))
			goto out;

		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
		dev_out = ip_dev_find(oldflp->fl4_src);
		if (dev_out == NULL)
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(saddr) can return the wrong iface if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the
		      saddr of another iface. --ANK
		 */

		if (oldflp->oif == 0
		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* Special hack: the user can direct multicasts
			   and limited broadcasts via the necessary
			   interface without fiddling with
			   IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind the socket to loopback, set ttl to
			   zero and expect that it will work.
			   From the viewpoint of the routing cache they
			   are broken, because we are not allowed to
			   build a multicast path with a loopback source
			   addr (look, the routing cache cannot know
			   that ttl is zero, so the packet will not
			   leave this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		dev_out = NULL;
	}

	if (oldflp->oif) {
		dev_out = dev_get_by_index(&init_net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (MULTICAST(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = init_net.loopback_dev;
		dev_hold(dev_out);
		fl.oif = init_net.loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(&fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong.
			   Assume that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses.  When oif is specified, the
			   routing tables are looked up with only one
			   purpose: to catch if the destination is
			   gatewayed, rather than direct.  Moreover, if
			   MSG_DONTROUTE is set, we send the packet,
			   ignoring both the routing tables and the
			   ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = init_net.loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(&fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;

make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}

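/*
 * Output route fast path: look the flow up in the cache under RCU
 * (bottom halves disabled) and fall back to ip_route_output_slow() on
 * a miss.
 */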
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
		      (IPTOS_RT_MASK | RTO_ONLINK))) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

	return ip_route_output_slow(rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

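/*
 * Blackhole routes absorb traffic without a usable device: input and
 * output both discard, and PMTU updates are ignored.  They stand in
 * for a real route when an XFRM lookup returns -EREMOTE (see
 * ip_route_output_flow() below).
 */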
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= __constant_htons(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_dst_check,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.entry_size		= sizeof(struct rtable),
};

static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}

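/*
 * Resolve an output route and, when a transport protocol is given,
 * fill in any missing flow addresses and pass the result through the
 * XFRM (IPsec) lookup.
 *
 * A minimal caller sketch (illustrative only; "dip" and "sk" are
 * placeholders supplied by the caller, error handling trimmed):
 *
 *	struct flowi fl = { .oif = 0,
 *			    .nl_u = { .ip4_u = { .daddr = dip } },
 *			    .proto = IPPROTO_UDP };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_flow(&rt, &fl, sk, 0) == 0) {
 *		... use rt->u.dst ...
 *		ip_rt_put(rt);	 (drop the reference when done)
 *	}
 */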
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(rp, flp, sk);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(rp, flp, NULL, 0);
}

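/*
 * Serialize one cached route as an RTM_NEWROUTE netlink message:
 * rtmsg header, address/device/metric attributes, then the cacheinfo
 * block (expires, error, inet_peer data).
 */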
static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->fl.fl4_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
		    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
			int err = ipmr_get_route(skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

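/*
 * RTM_GETROUTE handler: resolve the route the caller asks about
 * (through ip_route_input() when an input interface is given,
 * ip_route_output_key() otherwise) and unicast the result back.
 */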
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = in_skb->sk->sk_net;
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	if (net != &init_net)
		return -EINVAL;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(&init_net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = (struct rtable *)skb->dst;
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(&rt, &fl);
	}

	if (err)
		goto errout_free;

	skb->dst = &rt->u.dst;
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

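/*
 * Dump the whole route cache to netlink, walking the hash buckets
 * under rcu_read_lock_bh() and resuming from cb->args[] across calls.
 */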
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++) {
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (idx < s_idx)
				continue;
			skb->dst = dst_clone(&rt->u.dst);
			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				dst_release(xchg(&skb->dst, NULL));
				rcu_read_unlock_bh();
				goto done;
			}
			dst_release(xchg(&skb->dst, NULL));
		}
		rcu_read_unlock_bh();
		s_idx = 0;
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(0);
}

#ifdef CONFIG_SYSCTL
static int flush_delay;

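/*
 * Handler for /proc/sys/net/ipv4/route/flush.  The file is write-only
 * (mode 0200): the written integer becomes flush_delay and the cache
 * is flushed with that delay.  Illustrative shell usage:
 *
 *	# echo 0 > /proc/sys/net/ipv4/route/flush
 */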
static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
				     struct file *filp, void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
		rt_cache_flush(flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
					      int __user *name,
					      int nlen,
					      void __user *oldval,
					      size_t __user *oldlenp,
					      void __user *newval,
					      size_t newlen)
{
	int delay;
	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	rt_cache_flush(delay);
	return 0;
}

ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.data		= &flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= &ipv4_sysctl_rtcache_flush,
		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
		.procname	= "min_delay",
		.data		= &ip_rt_min_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
		.procname	= "max_delay",
		.data		= &ip_rt_max_delay,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_ms_jiffies,
		.strategy	= &sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec_jiffies,
		.strategy	= &sysctl_jiffies,
	},
	{ .ctl_name = 0 }
};
#endif

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

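/*
 * Boot-time initialization: seed the hash secret, set up the dst slab
 * cache and the route hash table (sized from memory or the
 * "rhash_entries=" boot parameter), start the flush/secret-rotation
 * timers, and register the /proc and rtnetlink entry points.
 */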
int __init ip_rt_init(void)
{
	int rc = 0;

	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
			     (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					0);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	setup_timer(&rt_flush_timer, rt_run_flush, 0);
	setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);

	/* All the timers started at system startup tend
	   to synchronize.  Perturb them a bit.
	 */
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
		ip_rt_secret_interval;
	add_timer(&rt_secret_timer);

#ifdef CONFIG_PROC_FS
	{
	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
	if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
					     init_net.proc_net_stat))) {
		return -ENOMEM;
	}
	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
	}
#ifdef CONFIG_NET_CLS_ROUTE
	create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
#endif
#endif
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

	return rc;
}

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);