/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static int ip_rt_flush_expected;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		__constant_htons(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		ip_local_out,
	.entry_size =		sizeof(struct rtable),
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
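/*
 * Presumably indexed via rt_tos2priority() in <net/route.h>, which uses
 * the 4-bit TOS field shifted right by one, so each TOS value maps to a
 * packet-scheduler priority; odd indices select the ECN_OR_COST()
 * variants.
 */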


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
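/*
 * Concretely (see rt_cache_get_first() and ip_rt_redirect() below):
 * readers walk a chain under rcu_read_lock_bh() and fetch links with
 * rcu_dereference(); writers take rt_hash_lock_addr(hash) and publish
 * new links with rcu_assign_pointer(), so lookups never need the lock.
 */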

struct rt_hash_bucket {
	struct rtable	*chain;
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks. The size of this table is a power of two and depends on
 * the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table;
static unsigned			rt_hash_mask;
static unsigned int		rt_hash_log;
static unsigned int		rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		(__force u32)(__be32)(saddr) ^ ((idx) << 5))
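/*
 * The hash keys on (daddr, saddr, oif) and is seeded with the random
 * rt_hash_rnd, which rt_secret_rebuild() regenerates periodically so
 * that chain placement stays unpredictable.  Typical call, as in the
 * redirect and PMTU paths below:
 *
 *	unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
 */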

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
	);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif

static __init int ip_rt_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
	if (!pde)
		goto err2;

	pde->proc_fops = &rt_cpu_seq_fops;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
#else
static inline int ip_rt_proc_init(struct net *net)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

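/*
 * Decide whether a cache entry may be evicted: entries still holding a
 * reference are never expired here, and entries past their hard expiry
 * always are.  Otherwise an entry younger than tmo1 is kept unless it is
 * a "fast clean" candidate (broadcast/multicast colliding with more
 * useful entries), and a valuable entry (redirected, notify, or expiring)
 * is kept until it is older than tmo2.
 */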
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
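/*
 * rt_intern_hash() below tracks the chain entry with the lowest score as
 * its eviction candidate, so recently used, valuable, and output/unicast
 * entries survive longest when a chain grows past ip_rt_gc_elasticity.
 */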

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}
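/*
 * All key fields are compared branch-free: each XOR is zero only when
 * the corresponding fields match, the ORs accumulate any difference, and
 * the single test is the final == 0.  The u16 loads compare tos and the
 * byte adjacent to it in one access.
 */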

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

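/*
 * Each rt_check_expire() pass visits goal = (buckets * ip_rt_gc_interval)
 * / ip_rt_gc_timeout chains, so the whole table is covered roughly once
 * per ip_rt_gc_timeout.  Within a chain the per-bucket timeout "tmo" is
 * halved for every surviving entry, which trims long chains harder.
 */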
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * If a whole flush was scheduled, it is done.
 * Otherwise, we call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	if (ip_rt_flush_expected) {
		ip_rt_flush_expected = 0;
		rt_do_flush(1);
	} else
		rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long process_context)
{
	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	rt_do_flush(process_context);
}

static DEFINE_SPINLOCK(rt_flush_lock);

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If the flush timer is already running
		   and the flush request is not immediate (delay > 0):

		   if the deadline is not yet reached, prolong the timer to
		   "delay", otherwise fire it at the deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(user_mode);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}

/*
 * We change rt_hash_rnd and ask the next rt_worker_func() invocation
 * to perform a flush in process context.
 */
static void rt_secret_rebuild(unsigned long dummy)
{
	get_random_bytes(&rt_hash_rnd, 4);
	ip_rt_flush_expected = 1;
	cancel_delayed_work(&expires_work);
	schedule_delayed_work(&expires_work, HZ/10);
	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network is idle,
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */

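/*
 * With the defaults above, collection only does real work once the cache
 * holds more than ip_rt_gc_elasticity (8) entries per hash bucket
 * (goal = entries - (8 << rt_hash_log) below), and it is rate-limited to
 * one run per ip_rt_gc_min_interval (HZ / 2) unless the table is full.
 */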
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* The goal is not achieved. We stop the process if:

		   - expire is reduced to zero; otherwise expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length,
		 * and when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache,
			   as it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.dst.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.dst.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

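/*
 * ICMP redirect handling: after validating the advertised gateway, the
 * 2x2 loop below probes the cache under every key combination the entry
 * may have been stored with -- (saddr, 0) x (ifindex, 0) -- and replaces
 * each matching entry with a clone pointing at the new gateway, flagged
 * RTCF_REDIRECTED so that ipv4_negative_advice() can drop it later.
 */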
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
				rt->u.dst.xfrm		= NULL;

				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
			"%u.%u.%u.%u ignored.\n"
			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
		       NIPQUAD(saddr), NIPQUAD(daddr));
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable*)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif);
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
					  "%u.%u.%u.%u/%02x dropped\n",
				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start to send redirects
 *	   again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

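/*
 * Worked example with the defaults above (ip_rt_redirect_load = HZ/50,
 * i.e. 20 ms): the next redirect is allowed once rate_last +
 * (20 ms << rate_tokens) has passed, giving gaps of 40 ms, 80 ms,
 * 160 ms, ... between transmissions.  After ip_rt_redirect_number (9)
 * unanswered redirects we go silent until ip_rt_redirect_silence
 * ((HZ/50) << 10, about 20 s) elapses without further triggers.
 */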
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
				NIPQUAD(rt->rt_src), rt->rt_iif,
				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = (struct rtable*)skb->dst;
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

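	/* Token-bucket rate limiting for the ICMP error: tokens accrue with
	 * the jiffies elapsed since rate_last, capped at ip_rt_error_burst,
	 * and each ICMP_DEST_UNREACH sent costs ip_rt_error_cost tokens.
	 */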
	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
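/*
 * This is the plateau-table search suggested by RFC 1191 for the case
 * where a router sends Fragmentation Needed without a next-hop MTU:
 * fall back to the largest common MTU below the failing packet size,
 * bottoming out at the IPv4 minimum of 68.
 */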

unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu)
{
	int i;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (i = 0; i < 2; i++) {
		unsigned hash = rt_hash(daddr, skeys[i], 0);

		rcu_read_lock();
		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
		     rth = rcu_dereference(rth->u.dst.rt_next)) {
			if (rth->fl.fl4_dst == daddr &&
			    rth->fl.fl4_src == skeys[i] &&
			    rth->rt_dst == daddr &&
			    rth->rt_src == iph->saddr &&
			    rth->fl.iif == 0 &&
			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
			    rth->u.dst.dev->nd_net == net) {
				unsigned short mtu = new_mtu;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
		}
		rcu_read_unlock();
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev->nd_net->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = (struct rtable *) skb->dst;
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
				       RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

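/*
 * Fill in the cached route from the FIB lookup result: inherit the
 * gateway for link-scope nexthops, copy the FIB metrics, then apply
 * defaults for anything left unset (device MTU capped at IP_MAX_MTU,
 * hoplimit from sysctl_ip_default_ttl, and an advertised MSS of at
 * least ip_rt_min_advmss).
 */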
1589static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1590{
1591 struct fib_info *fi = res->fi;
1592
1593 if (fi) {
1594 if (FIB_RES_GW(*res) &&
1595 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1596 rt->rt_gateway = FIB_RES_GW(*res);
1597 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1598 sizeof(rt->u.dst.metrics));
1599 if (fi->fib_mtu == 0) {
1600 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1601 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1602 rt->rt_gateway != rt->rt_dst &&
1603 rt->u.dst.dev->mtu > 576)
1604 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1605 }
1606#ifdef CONFIG_NET_CLS_ROUTE
1607 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1608#endif
1609 } else
1610 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1611
1612 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1613 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1614 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1615 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1616 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1617 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1618 ip_rt_min_advmss);
1619 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1620 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1621
1622#ifdef CONFIG_NET_CLS_ROUTE
1623#ifdef CONFIG_IP_MULTIPLE_TABLES
1624 set_class_tag(rt, fib_rules_tclass(res));
1625#endif
1626 set_class_tag(rt, itag);
1627#endif
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001628 rt->rt_type = res->type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001629}
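/*
 * Worked example for the MTU logic above (illustrative): when the FIB
 * entry carries no MTU metric, the device MTU is used; but if RTAX_MTU
 * is locked, the route is gatewayed (rt_gateway != rt_dst) and the
 * device MTU exceeds 576, the MTU falls back to 576, the datagram size
 * every IPv4 host must accept (RFC 791).
 */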
1630
Al Viro9e12bb22006-09-26 21:25:20 -07001631static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632 u8 tos, struct net_device *dev, int our)
1633{
1634 unsigned hash;
1635 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001636 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001637 struct in_device *in_dev = in_dev_get(dev);
1638 u32 itag = 0;
1639
1640 /* Primary sanity checks. */
1641
1642 if (in_dev == NULL)
1643 return -EINVAL;
1644
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001645 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001646 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001647 goto e_inval;
1648
Joe Perchesf97c1e02007-12-16 13:45:43 -08001649 if (ipv4_is_zeronet(saddr)) {
1650 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001651 goto e_inval;
1652 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1653 } else if (fib_validate_source(saddr, 0, tos, 0,
1654 dev, &spec_dst, &itag) < 0)
1655 goto e_inval;
1656
1657 rth = dst_alloc(&ipv4_dst_ops);
1658 if (!rth)
1659 goto e_nobufs;
1660
1661 rth->u.dst.output= ip_rt_bug;
1662
1663 atomic_set(&rth->u.dst.__refcnt, 1);
1664 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001665 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001666 rth->u.dst.flags |= DST_NOPOLICY;
1667 rth->fl.fl4_dst = daddr;
1668 rth->rt_dst = daddr;
1669 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001670 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671 rth->fl.fl4_src = saddr;
1672 rth->rt_src = saddr;
1673#ifdef CONFIG_NET_CLS_ROUTE
1674 rth->u.dst.tclassid = itag;
1675#endif
1676 rth->rt_iif =
1677 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001678 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679 dev_hold(rth->u.dst.dev);
1680 rth->idev = in_dev_get(rth->u.dst.dev);
1681 rth->fl.oif = 0;
1682 rth->rt_gateway = daddr;
1683 rth->rt_spec_dst= spec_dst;
1684 rth->rt_type = RTN_MULTICAST;
1685 rth->rt_flags = RTCF_MULTICAST;
1686 if (our) {
1687 rth->u.dst.input= ip_local_deliver;
1688 rth->rt_flags |= RTCF_LOCAL;
1689 }
1690
1691#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001692 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693 rth->u.dst.input = ip_mr_input;
1694#endif
1695 RT_CACHE_STAT_INC(in_slow_mc);
1696
1697 in_dev_put(in_dev);
Al Viro8c7bc842006-09-26 21:26:19 -07001698 hash = rt_hash(daddr, saddr, dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1700
1701e_nobufs:
1702 in_dev_put(in_dev);
1703 return -ENOBUFS;
1704
1705e_inval:
1706 in_dev_put(in_dev);
1707 return -EINVAL;
1708}
1709
1710
1711static void ip_handle_martian_source(struct net_device *dev,
1712 struct in_device *in_dev,
1713 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001714 __be32 daddr,
1715 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716{
1717 RT_CACHE_STAT_INC(in_martian_src);
1718#ifdef CONFIG_IP_ROUTE_VERBOSE
1719 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1720 /*
1721		 * RFC1812 recommendation: if the source is martian,
1722		 * the only hint is the MAC header.
1723 */
1724 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1725 "%u.%u.%u.%u, on dev %s\n",
1726 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001727 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001728 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001729 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730 printk(KERN_WARNING "ll header: ");
1731 for (i = 0; i < dev->hard_header_len; i++, p++) {
1732 printk("%02x", *p);
1733 if (i < (dev->hard_header_len - 1))
1734 printk(":");
1735 }
1736 printk("\n");
1737 }
1738 }
1739#endif
1740}
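/*
 * A typical rate-limited log line, with hypothetical addresses:
 *
 *	martian source 10.1.2.3 from 127.0.0.1, on dev eth0
 *	ll header: 00:19:d1:2a:bc:de:00:16:3e:11:22:33:08:00
 *
 * i.e. the raw link-layer header bytes, colon-separated, as the only
 * RFC1812-sanctioned hint about the offender.
 */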
1741
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001742static inline int __mkroute_input(struct sk_buff *skb,
1743 struct fib_result* res,
1744 struct in_device *in_dev,
Al Viro9e12bb22006-09-26 21:25:20 -07001745 __be32 daddr, __be32 saddr, u32 tos,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001746 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001747{
1748
1749 struct rtable *rth;
1750 int err;
1751 struct in_device *out_dev;
1752 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001753 __be32 spec_dst;
1754 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001755
1756 /* get a working reference to the output device */
1757 out_dev = in_dev_get(FIB_RES_DEV(*res));
1758 if (out_dev == NULL) {
1759 if (net_ratelimit())
1760			printk(KERN_CRIT "Bug in ip_route_input"
1761			       "_slow(). Please report.\n");
1762 return -EINVAL;
1763 }
1764
1765
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001766 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001767 in_dev->dev, &spec_dst, &itag);
1768 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001769 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001770 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001771
Linus Torvalds1da177e2005-04-16 15:20:36 -07001772 err = -EINVAL;
1773 goto cleanup;
1774 }
1775
1776 if (err)
1777 flags |= RTCF_DIRECTSRC;
1778
Rami Rosencb7928a2008-01-09 00:18:24 -08001779 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001780 (IN_DEV_SHARED_MEDIA(out_dev) ||
1781 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1782 flags |= RTCF_DOREDIRECT;
1783
1784 if (skb->protocol != htons(ETH_P_IP)) {
1785		/* Not IP (i.e. ARP). Do not create a route if it is
1786 * invalid for proxy arp. DNAT routes are always valid.
1787 */
Rami Rosencb7928a2008-01-09 00:18:24 -08001788 if (out_dev == in_dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001789 err = -EINVAL;
1790 goto cleanup;
1791 }
1792 }
1793
1794
1795 rth = dst_alloc(&ipv4_dst_ops);
1796 if (!rth) {
1797 err = -ENOBUFS;
1798 goto cleanup;
1799 }
1800
Julian Anastasovce723d82005-09-08 13:34:47 -07001801 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001802 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001803 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001804 rth->u.dst.flags |= DST_NOPOLICY;
Herbert Xu42f811b2007-06-04 23:34:44 -07001805 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806 rth->u.dst.flags |= DST_NOXFRM;
1807 rth->fl.fl4_dst = daddr;
1808 rth->rt_dst = daddr;
1809 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001810 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001811 rth->fl.fl4_src = saddr;
1812 rth->rt_src = saddr;
1813 rth->rt_gateway = daddr;
1814 rth->rt_iif =
1815 rth->fl.iif = in_dev->dev->ifindex;
1816 rth->u.dst.dev = (out_dev)->dev;
1817 dev_hold(rth->u.dst.dev);
1818 rth->idev = in_dev_get(rth->u.dst.dev);
1819 rth->fl.oif = 0;
1820 rth->rt_spec_dst= spec_dst;
1821
1822 rth->u.dst.input = ip_forward;
1823 rth->u.dst.output = ip_output;
1824
1825 rt_set_nexthop(rth, res, itag);
1826
1827 rth->rt_flags = flags;
1828
1829 *result = rth;
1830 err = 0;
1831 cleanup:
1832 /* release the working reference to the output device */
1833 in_dev_put(out_dev);
1834 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001835}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836
David S. Millere06e7c62007-06-10 17:22:39 -07001837static inline int ip_mkroute_input(struct sk_buff *skb,
1838 struct fib_result* res,
1839 const struct flowi *fl,
1840 struct in_device *in_dev,
1841 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001842{
Chuck Short7abaa272005-06-22 22:10:23 -07001843 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001844 int err;
1845 unsigned hash;
1846
1847#ifdef CONFIG_IP_ROUTE_MULTIPATH
1848 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1849 fib_select_multipath(fl, res);
1850#endif
1851
1852 /* create a routing cache entry */
1853 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1854 if (err)
1855 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001856
1857 /* put it into the cache */
Al Viro8c7bc842006-09-26 21:26:19 -07001858 hash = rt_hash(daddr, saddr, fl->iif);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001859 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001860}
1861
Linus Torvalds1da177e2005-04-16 15:20:36 -07001862/*
1863 *	NOTE. We drop all packets that have local source
1864 *	addresses, because every properly looped-back packet
1865 *	must already have the correct destination attached by the output routine.
1866 *
1867 *	This approach solves two big problems:
1868 *	1. Non-simplex devices are handled properly.
1869 *	2. IP spoofing attempts are filtered with a 100% guarantee.
1870 */
1871
Al Viro9e12bb22006-09-26 21:25:20 -07001872static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873 u8 tos, struct net_device *dev)
1874{
1875 struct fib_result res;
1876 struct in_device *in_dev = in_dev_get(dev);
1877 struct flowi fl = { .nl_u = { .ip4_u =
1878 { .daddr = daddr,
1879 .saddr = saddr,
1880 .tos = tos,
1881 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001882 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001883 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001884 .iif = dev->ifindex };
1885 unsigned flags = 0;
1886 u32 itag = 0;
1887 struct rtable * rth;
1888 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07001889 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001890 int err = -EINVAL;
1891 int free_res = 0;
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001892 struct net * net = dev->nd_net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001893
1894 /* IP on this device is disabled. */
1895
1896 if (!in_dev)
1897 goto out;
1898
1899	/* Check for the weirdest martians, which may not be detected
1900 by fib_lookup.
1901 */
1902
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001903 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001904 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001905 goto martian_source;
1906
Al Viroe4485152006-09-26 22:15:01 -07001907 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908 goto brd_input;
1909
1910 /* Accept zero addresses only to limited broadcast;
1911	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1912 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08001913 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001914 goto martian_source;
1915
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001916 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001917 ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 goto martian_destination;
1919
1920 /*
1921 * Now we are ready to route packet.
1922 */
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001923 if ((err = fib_lookup(net, &fl, &res)) != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001925 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001926 goto no_route;
1927 }
1928 free_res = 1;
1929
1930 RT_CACHE_STAT_INC(in_slow_tot);
1931
1932 if (res.type == RTN_BROADCAST)
1933 goto brd_input;
1934
1935 if (res.type == RTN_LOCAL) {
1936 int result;
1937 result = fib_validate_source(saddr, daddr, tos,
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001938 net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001939 dev, &spec_dst, &itag);
1940 if (result < 0)
1941 goto martian_source;
1942 if (result)
1943 flags |= RTCF_DIRECTSRC;
1944 spec_dst = daddr;
1945 goto local_input;
1946 }
1947
1948 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001949 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001950 if (res.type != RTN_UNICAST)
1951 goto martian_destination;
1952
1953 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954done:
1955 in_dev_put(in_dev);
1956 if (free_res)
1957 fib_res_put(&res);
1958out: return err;
1959
1960brd_input:
1961 if (skb->protocol != htons(ETH_P_IP))
1962 goto e_inval;
1963
Joe Perchesf97c1e02007-12-16 13:45:43 -08001964 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001965 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1966 else {
1967 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1968 &itag);
1969 if (err < 0)
1970 goto martian_source;
1971 if (err)
1972 flags |= RTCF_DIRECTSRC;
1973 }
1974 flags |= RTCF_BROADCAST;
1975 res.type = RTN_BROADCAST;
1976 RT_CACHE_STAT_INC(in_brd);
1977
1978local_input:
1979 rth = dst_alloc(&ipv4_dst_ops);
1980 if (!rth)
1981 goto e_nobufs;
1982
1983 rth->u.dst.output= ip_rt_bug;
1984
1985 atomic_set(&rth->u.dst.__refcnt, 1);
1986 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001987 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001988 rth->u.dst.flags |= DST_NOPOLICY;
1989 rth->fl.fl4_dst = daddr;
1990 rth->rt_dst = daddr;
1991 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001992 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993 rth->fl.fl4_src = saddr;
1994 rth->rt_src = saddr;
1995#ifdef CONFIG_NET_CLS_ROUTE
1996 rth->u.dst.tclassid = itag;
1997#endif
1998 rth->rt_iif =
1999 rth->fl.iif = dev->ifindex;
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002000 rth->u.dst.dev = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002001 dev_hold(rth->u.dst.dev);
2002 rth->idev = in_dev_get(rth->u.dst.dev);
2003 rth->rt_gateway = daddr;
2004 rth->rt_spec_dst= spec_dst;
2005 rth->u.dst.input= ip_local_deliver;
2006 rth->rt_flags = flags|RTCF_LOCAL;
2007 if (res.type == RTN_UNREACHABLE) {
2008 rth->u.dst.input= ip_error;
2009 rth->u.dst.error= -err;
2010 rth->rt_flags &= ~RTCF_LOCAL;
2011 }
2012 rth->rt_type = res.type;
Al Viro8c7bc842006-09-26 21:26:19 -07002013 hash = rt_hash(daddr, saddr, fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002014 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2015 goto done;
2016
2017no_route:
2018 RT_CACHE_STAT_INC(in_no_route);
2019 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2020 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002021 if (err == -ESRCH)
2022 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002023 goto local_input;
2024
2025 /*
2026 * Do not cache martian addresses: they should be logged (RFC1812)
2027 */
2028martian_destination:
2029 RT_CACHE_STAT_INC(in_martian_dst);
2030#ifdef CONFIG_IP_ROUTE_VERBOSE
2031 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2032 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2033 "%u.%u.%u.%u, dev %s\n",
2034 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2035#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002036
2037e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002038 err = -EHOSTUNREACH;
2039 goto done;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002040
Linus Torvalds1da177e2005-04-16 15:20:36 -07002041e_inval:
2042 err = -EINVAL;
2043 goto done;
2044
2045e_nobufs:
2046 err = -ENOBUFS;
2047 goto done;
2048
2049martian_source:
2050 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2051 goto e_inval;
2052}
2053
Al Viro9e12bb22006-09-26 21:25:20 -07002054int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002055 u8 tos, struct net_device *dev)
2056{
2057 struct rtable * rth;
2058 unsigned hash;
2059 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002060 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002061
Denis V. Lunevb5921912008-01-22 23:50:25 -08002062 net = skb->dev->nd_net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002063 tos &= IPTOS_RT_MASK;
Al Viro8c7bc842006-09-26 21:26:19 -07002064 hash = rt_hash(daddr, saddr, iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002065
2066 rcu_read_lock();
2067 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002068 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002069 if (rth->fl.fl4_dst == daddr &&
2070 rth->fl.fl4_src == saddr &&
2071 rth->fl.iif == iif &&
2072 rth->fl.oif == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002073 rth->fl.mark == skb->mark &&
Denis V. Lunevb5921912008-01-22 23:50:25 -08002074 rth->fl.fl4_tos == tos &&
2075 rth->u.dst.dev->nd_net == net) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002076 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077 RT_CACHE_STAT_INC(in_hit);
2078 rcu_read_unlock();
2079 skb->dst = (struct dst_entry*)rth;
2080 return 0;
2081 }
2082 RT_CACHE_STAT_INC(in_hlist_search);
2083 }
2084 rcu_read_unlock();
2085
2086 /* Multicast recognition logic is moved from route cache to here.
2087 The problem was that too many Ethernet cards have broken/missing
2088 hardware multicast filters :-( As result the host on multicasting
2089 network acquires a lot of useless route cache entries, sort of
2090 SDR messages from all the world. Now we try to get rid of them.
2091 Really, provided software IP multicast filter is organized
2092 reasonably (at least, hashed), it does not result in a slowdown
2093 comparing with route cache reject entries.
2094 Note, that multicast routers are not affected, because
2095 route cache entry is created eventually.
2096 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002097 if (ipv4_is_multicast(daddr)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002098 struct in_device *in_dev;
2099
2100 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002101 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002102 int our = ip_check_mc(in_dev, daddr, saddr,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002103 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002104 if (our
2105#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002106 || (!ipv4_is_local_multicast(daddr) &&
2107 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002108#endif
2109 ) {
2110 rcu_read_unlock();
2111 return ip_route_input_mc(skb, daddr, saddr,
2112 tos, dev, our);
2113 }
2114 }
2115 rcu_read_unlock();
2116 return -EINVAL;
2117 }
2118 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2119}
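/*
 * Typical call site (sketch, not from this file): the IP receive path
 * resolves a route for each packet and then lets dst->input() pick
 * between ip_local_deliver() and ip_forward():
 *
 *	if (ip_route_input(skb, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
 *			   ip_hdr(skb)->tos, skb->dev))
 *		goto drop;
 *	return dst_input(skb);
 */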
2120
2121static inline int __mkroute_output(struct rtable **result,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002122 struct fib_result* res,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002123 const struct flowi *fl,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002124 const struct flowi *oldflp,
2125 struct net_device *dev_out,
2126 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002127{
2128 struct rtable *rth;
2129 struct in_device *in_dev;
2130 u32 tos = RT_FL_TOS(oldflp);
2131 int err = 0;
2132
Joe Perchesf97c1e02007-12-16 13:45:43 -08002133 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002134 return -EINVAL;
2135
Al Viroe4485152006-09-26 22:15:01 -07002136 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002137 res->type = RTN_BROADCAST;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002138 else if (ipv4_is_multicast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002139 res->type = RTN_MULTICAST;
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002140 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002141 return -EINVAL;
2142
2143 if (dev_out->flags & IFF_LOOPBACK)
2144 flags |= RTCF_LOCAL;
2145
2146 /* get work reference to inet device */
2147 in_dev = in_dev_get(dev_out);
2148 if (!in_dev)
2149 return -EINVAL;
2150
2151 if (res->type == RTN_BROADCAST) {
2152 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2153 if (res->fi) {
2154 fib_info_put(res->fi);
2155 res->fi = NULL;
2156 }
2157 } else if (res->type == RTN_MULTICAST) {
2158 flags |= RTCF_MULTICAST|RTCF_LOCAL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002159 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160 oldflp->proto))
2161 flags &= ~RTCF_LOCAL;
2162		/* If a multicast route does not exist, use the
2163		   default one, but do not gateway in this case.
2164		   Yes, it is a hack.
2165 */
2166 if (res->fi && res->prefixlen < 4) {
2167 fib_info_put(res->fi);
2168 res->fi = NULL;
2169 }
2170 }
2171
2172
2173 rth = dst_alloc(&ipv4_dst_ops);
2174 if (!rth) {
2175 err = -ENOBUFS;
2176 goto cleanup;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002177 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002178
Julian Anastasovce723d82005-09-08 13:34:47 -07002179 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002180 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002181 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002182 rth->u.dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002183 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002184 rth->u.dst.flags |= DST_NOPOLICY;
2185
2186 rth->fl.fl4_dst = oldflp->fl4_dst;
2187 rth->fl.fl4_tos = tos;
2188 rth->fl.fl4_src = oldflp->fl4_src;
2189 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002190 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002191 rth->rt_dst = fl->fl4_dst;
2192 rth->rt_src = fl->fl4_src;
2193 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002194	/* get references to the devices that are to be held by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002195 cache entry */
2196 rth->u.dst.dev = dev_out;
2197 dev_hold(dev_out);
2198 rth->idev = in_dev_get(dev_out);
2199 rth->rt_gateway = fl->fl4_dst;
2200 rth->rt_spec_dst= fl->fl4_src;
2201
2202 rth->u.dst.output=ip_output;
2203
2204 RT_CACHE_STAT_INC(out_slow_tot);
2205
2206 if (flags & RTCF_LOCAL) {
2207 rth->u.dst.input = ip_local_deliver;
2208 rth->rt_spec_dst = fl->fl4_dst;
2209 }
2210 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2211 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002212 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002213 !(dev_out->flags & IFF_LOOPBACK)) {
2214 rth->u.dst.output = ip_mc_output;
2215 RT_CACHE_STAT_INC(out_slow_mc);
2216 }
2217#ifdef CONFIG_IP_MROUTE
2218 if (res->type == RTN_MULTICAST) {
2219 if (IN_DEV_MFORWARD(in_dev) &&
Joe Perchesf97c1e02007-12-16 13:45:43 -08002220 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221 rth->u.dst.input = ip_mr_input;
2222 rth->u.dst.output = ip_mc_output;
2223 }
2224 }
2225#endif
2226 }
2227
2228 rt_set_nexthop(rth, res, 0);
2229
2230 rth->rt_flags = flags;
2231
2232 *result = rth;
2233 cleanup:
2234 /* release work reference to inet device */
2235 in_dev_put(in_dev);
2236
2237 return err;
2238}
2239
David S. Millere06e7c62007-06-10 17:22:39 -07002240static inline int ip_mkroute_output(struct rtable **rp,
2241 struct fib_result* res,
2242 const struct flowi *fl,
2243 const struct flowi *oldflp,
2244 struct net_device *dev_out,
2245 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002246{
Chuck Short7abaa272005-06-22 22:10:23 -07002247 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002248 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2249 unsigned hash;
2250 if (err == 0) {
Al Viro8c7bc842006-09-26 21:26:19 -07002251 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252 err = rt_intern_hash(hash, rth, rp);
2253 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002254
Linus Torvalds1da177e2005-04-16 15:20:36 -07002255 return err;
2256}
2257
Linus Torvalds1da177e2005-04-16 15:20:36 -07002258/*
2259 * Major route resolver routine.
2260 */
2261
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002262static int ip_route_output_slow(struct net *net, struct rtable **rp,
2263 const struct flowi *oldflp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264{
2265 u32 tos = RT_FL_TOS(oldflp);
2266 struct flowi fl = { .nl_u = { .ip4_u =
2267 { .daddr = oldflp->fl4_dst,
2268 .saddr = oldflp->fl4_src,
2269 .tos = tos & IPTOS_RT_MASK,
2270 .scope = ((tos & RTO_ONLINK) ?
2271 RT_SCOPE_LINK :
2272 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002274 .mark = oldflp->mark,
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002275 .iif = net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002276 .oif = oldflp->oif };
2277 struct fib_result res;
2278 unsigned flags = 0;
2279 struct net_device *dev_out = NULL;
2280 int free_res = 0;
2281 int err;
2282
2283
2284 res.fi = NULL;
2285#ifdef CONFIG_IP_MULTIPLE_TABLES
2286 res.r = NULL;
2287#endif
2288
2289 if (oldflp->fl4_src) {
2290 err = -EINVAL;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002291 if (ipv4_is_multicast(oldflp->fl4_src) ||
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002292 ipv4_is_lbcast(oldflp->fl4_src) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002293 ipv4_is_zeronet(oldflp->fl4_src))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002294 goto out;
2295
2296 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002297 dev_out = ip_dev_find(net, oldflp->fl4_src);
David S. Millerf6c5d732007-05-18 02:07:50 -07002298 if (dev_out == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002299 goto out;
2300
2301		/* I removed the check for oif == dev_out->oif here.
2302		   It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002303		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2304 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002305 2. Moreover, we are allowed to send packets with saddr
2306 of another iface. --ANK
2307 */
2308
David S. Millerf6c5d732007-05-18 02:07:50 -07002309 if (oldflp->oif == 0
Joe Perchesf97c1e02007-12-16 13:45:43 -08002310 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2311 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312			/* Special hack: the user can direct multicasts
2313			   and limited broadcasts via the necessary interface
2314			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2315			   This hack is not just for fun, it allows
2316			   vic, vat and friends to work.
2317			   They bind a socket to loopback, set the ttl to zero
2318			   and expect that it will work.
2319			   From the viewpoint of the routing cache they are broken,
2320			   because we are not allowed to build a multicast path
2321			   with a loopback source addr (look, the routing cache
2322			   cannot know that the ttl is zero, so the packet
2323			   will not leave this host and the route looks valid).
2324			   Luckily, this hack is a good workaround.
2325 */
2326
2327 fl.oif = dev_out->ifindex;
2328 goto make_route;
2329 }
2330 if (dev_out)
2331 dev_put(dev_out);
2332 dev_out = NULL;
2333 }
2334
2335
2336 if (oldflp->oif) {
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002337 dev_out = dev_get_by_index(net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002338 err = -ENODEV;
2339 if (dev_out == NULL)
2340 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002341
2342 /* RACE: Check return value of inet_select_addr instead. */
2343 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002344 dev_put(dev_out);
2345 goto out; /* Wrong error code */
2346 }
2347
Joe Perchesf97c1e02007-12-16 13:45:43 -08002348 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2349 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002350 if (!fl.fl4_src)
2351 fl.fl4_src = inet_select_addr(dev_out, 0,
2352 RT_SCOPE_LINK);
2353 goto make_route;
2354 }
2355 if (!fl.fl4_src) {
Joe Perchesf97c1e02007-12-16 13:45:43 -08002356 if (ipv4_is_multicast(oldflp->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357 fl.fl4_src = inet_select_addr(dev_out, 0,
2358 fl.fl4_scope);
2359 else if (!oldflp->fl4_dst)
2360 fl.fl4_src = inet_select_addr(dev_out, 0,
2361 RT_SCOPE_HOST);
2362 }
2363 }
2364
2365 if (!fl.fl4_dst) {
2366 fl.fl4_dst = fl.fl4_src;
2367 if (!fl.fl4_dst)
2368 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2369 if (dev_out)
2370 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002371 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002372 dev_hold(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002373 fl.oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374 res.type = RTN_LOCAL;
2375 flags |= RTCF_LOCAL;
2376 goto make_route;
2377 }
2378
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002379 if (fib_lookup(net, &fl, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002380 res.fi = NULL;
2381 if (oldflp->oif) {
2382			/* Apparently, the routing tables are wrong. Assume
2383			   that the destination is on-link.
2384
2385			   WHY? DW.
2386			   Because we are allowed to send to an iface
2387			   even if it has NO routes and NO assigned
2388			   addresses. When an oif is specified, the routing
2389			   tables are looked up with only one purpose:
2390			   to catch whether the destination is gatewayed, rather than
2391			   direct. Moreover, if MSG_DONTROUTE is set,
2392			   we send the packet, ignoring both the routing tables
2393			   and the ifaddr state. --ANK
2394
2395
2396			   We could do this even when the oif is unknown
2397			   (as IPv6 likely does), but we do not.
2398 */
2399
2400 if (fl.fl4_src == 0)
2401 fl.fl4_src = inet_select_addr(dev_out, 0,
2402 RT_SCOPE_LINK);
2403 res.type = RTN_UNICAST;
2404 goto make_route;
2405 }
2406 if (dev_out)
2407 dev_put(dev_out);
2408 err = -ENETUNREACH;
2409 goto out;
2410 }
2411 free_res = 1;
2412
2413 if (res.type == RTN_LOCAL) {
2414 if (!fl.fl4_src)
2415 fl.fl4_src = fl.fl4_dst;
2416 if (dev_out)
2417 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002418 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002419 dev_hold(dev_out);
2420 fl.oif = dev_out->ifindex;
2421 if (res.fi)
2422 fib_info_put(res.fi);
2423 res.fi = NULL;
2424 flags |= RTCF_LOCAL;
2425 goto make_route;
2426 }
2427
2428#ifdef CONFIG_IP_ROUTE_MULTIPATH
2429 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2430 fib_select_multipath(&fl, &res);
2431 else
2432#endif
2433 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002434 fib_select_default(net, &fl, &res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002435
2436 if (!fl.fl4_src)
2437 fl.fl4_src = FIB_RES_PREFSRC(res);
2438
2439 if (dev_out)
2440 dev_put(dev_out);
2441 dev_out = FIB_RES_DEV(res);
2442 dev_hold(dev_out);
2443 fl.oif = dev_out->ifindex;
2444
2445
2446make_route:
2447 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2448
2449
2450 if (free_res)
2451 fib_res_put(&res);
2452 if (dev_out)
2453 dev_put(dev_out);
2454out: return err;
2455}
2456
Denis V. Lunev611c1832008-01-22 22:06:48 -08002457int __ip_route_output_key(struct net *net, struct rtable **rp,
2458 const struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002459{
2460 unsigned hash;
2461 struct rtable *rth;
2462
Al Viro8c7bc842006-09-26 21:26:19 -07002463 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002464
2465 rcu_read_lock_bh();
2466 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002467 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002468 if (rth->fl.fl4_dst == flp->fl4_dst &&
2469 rth->fl.fl4_src == flp->fl4_src &&
2470 rth->fl.iif == 0 &&
2471 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002472 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002473 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002474 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2475 rth->u.dst.dev->nd_net == net) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002476 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002477 RT_CACHE_STAT_INC(out_hit);
2478 rcu_read_unlock_bh();
2479 *rp = rth;
2480 return 0;
2481 }
2482 RT_CACHE_STAT_INC(out_hlist_search);
2483 }
2484 rcu_read_unlock_bh();
2485
Denis V. Lunev611c1832008-01-22 22:06:48 -08002486 return ip_route_output_slow(net, rp, flp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002487}
2488
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002489EXPORT_SYMBOL_GPL(__ip_route_output_key);
2490
David S. Miller14e50e52007-05-24 18:17:54 -07002491static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2492{
2493}
2494
2495static struct dst_ops ipv4_dst_blackhole_ops = {
2496 .family = AF_INET,
2497 .protocol = __constant_htons(ETH_P_IP),
2498 .destroy = ipv4_dst_destroy,
2499 .check = ipv4_dst_check,
2500 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2501 .entry_size = sizeof(struct rtable),
Eric Dumazete2422972008-01-30 20:07:45 -08002502 .entries = ATOMIC_INIT(0),
David S. Miller14e50e52007-05-24 18:17:54 -07002503};
2504
2505
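/*
 * Clone an existing route into a "blackhole" entry whose input and
 * output handlers are dst_discard. ip_route_output_flow() below
 * substitutes such an entry when __xfrm_lookup() returns -EREMOTE,
 * so packets are quietly dropped (rather than the lookup failing)
 * while the IPsec state is presumably still being resolved.
 */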
David S. Miller14e50e52007-05-24 18:17:54 -07002506static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2507{
2508 struct rtable *ort = *rp;
2509 struct rtable *rt = (struct rtable *)
2510 dst_alloc(&ipv4_dst_blackhole_ops);
2511
2512 if (rt) {
2513 struct dst_entry *new = &rt->u.dst;
2514
2515 atomic_set(&new->__refcnt, 1);
2516 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002517 new->input = dst_discard;
2518 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002519 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2520
2521 new->dev = ort->u.dst.dev;
2522 if (new->dev)
2523 dev_hold(new->dev);
2524
2525 rt->fl = ort->fl;
2526
2527 rt->idev = ort->idev;
2528 if (rt->idev)
2529 in_dev_hold(rt->idev);
2530 rt->rt_flags = ort->rt_flags;
2531 rt->rt_type = ort->rt_type;
2532 rt->rt_dst = ort->rt_dst;
2533 rt->rt_src = ort->rt_src;
2534 rt->rt_iif = ort->rt_iif;
2535 rt->rt_gateway = ort->rt_gateway;
2536 rt->rt_spec_dst = ort->rt_spec_dst;
2537 rt->peer = ort->peer;
2538 if (rt->peer)
2539 atomic_inc(&rt->peer->refcnt);
2540
2541 dst_free(new);
2542 }
2543
2544 dst_release(&(*rp)->u.dst);
2545 *rp = rt;
2546 return (rt ? 0 : -ENOMEM);
2547}
2548
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002549int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2550 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551{
2552 int err;
2553
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002554 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002555 return err;
2556
2557 if (flp->proto) {
2558 if (!flp->fl4_src)
2559 flp->fl4_src = (*rp)->rt_src;
2560 if (!flp->fl4_dst)
2561 flp->fl4_dst = (*rp)->rt_dst;
Herbert Xubb728452007-12-12 18:48:58 -08002562 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2563 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002564 if (err == -EREMOTE)
2565 err = ipv4_dst_blackhole(rp, flp, sk);
2566
2567 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002568 }
2569
2570 return 0;
2571}
2572
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002573EXPORT_SYMBOL_GPL(ip_route_output_flow);
2574
Denis V. Lunevf2063512008-01-22 22:07:34 -08002575int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002576{
Denis V. Lunevf2063512008-01-22 22:07:34 -08002577 return ip_route_output_flow(net, rp, flp, NULL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002578}
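/*
 * Minimal caller sketch (hypothetical values): fill in a flow key,
 * resolve it, and release the result with ip_rt_put() when done:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&init_net, &rt, &fl) == 0) {
 *		... transmit via rt->u.dst ...
 *		ip_rt_put(rt);
 *	}
 */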
2579
2580static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002581 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002582{
2583 struct rtable *rt = (struct rtable*)skb->dst;
2584 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002585 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002586 long expires;
2587 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002588
2589 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2590 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002591 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002592
2593 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002594 r->rtm_family = AF_INET;
2595 r->rtm_dst_len = 32;
2596 r->rtm_src_len = 0;
2597 r->rtm_tos = rt->fl.fl4_tos;
2598 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002599 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002600 r->rtm_type = rt->rt_type;
2601 r->rtm_scope = RT_SCOPE_UNIVERSE;
2602 r->rtm_protocol = RTPROT_UNSPEC;
2603 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604 if (rt->rt_flags & RTCF_NOTIFY)
2605 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002606
Al Viro17fb2c62006-09-26 22:15:25 -07002607 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002608
Linus Torvalds1da177e2005-04-16 15:20:36 -07002609 if (rt->fl.fl4_src) {
2610 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002611 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002612 }
2613 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002614 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002615#ifdef CONFIG_NET_CLS_ROUTE
2616 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002617 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002618#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002619 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002620 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002622 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002623
Linus Torvalds1da177e2005-04-16 15:20:36 -07002624 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002625 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002626
Linus Torvalds1da177e2005-04-16 15:20:36 -07002627 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002628 goto nla_put_failure;
2629
Thomas Grafe3703b32006-11-27 09:27:07 -08002630 error = rt->u.dst.error;
2631 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002632 if (rt->peer) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002633 id = rt->peer->ip_id_count;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002634 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002635 ts = rt->peer->tcp_ts;
James Morris9d729f72007-03-04 16:12:44 -08002636 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002637 }
2638 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002639
Linus Torvalds1da177e2005-04-16 15:20:36 -07002640 if (rt->fl.iif) {
2641#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002642 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002643
Joe Perchesf97c1e02007-12-16 13:45:43 -08002644 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Pavel Emelyanov586f1212007-12-16 13:32:48 -08002645 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002646 int err = ipmr_get_route(skb, r, nowait);
2647 if (err <= 0) {
2648 if (!nowait) {
2649 if (err == 0)
2650 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002651 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002652 } else {
2653 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002654 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002655 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002656 }
2657 }
2658 } else
2659#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002660 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002661 }
2662
Thomas Grafe3703b32006-11-27 09:27:07 -08002663 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2664 expires, error) < 0)
2665 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002666
Thomas Grafbe403ea2006-08-17 18:15:17 -07002667 return nlmsg_end(skb, nlh);
2668
2669nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002670 nlmsg_cancel(skb, nlh);
2671 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002672}
2673
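/*
 * inet_rtm_getroute() below answers RTM_GETROUTE requests; this is
 * what "ip route get 10.1.2.3" (hypothetical address) from iproute2
 * ends up exercising: an input- or output-path lookup whose result is
 * returned as a single rt_fill_info()-formatted RTM_NEWROUTE message.
 */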
Thomas Graf63f34442007-03-22 11:55:17 -07002674static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002675{
Denis V. Lunevb8542722007-12-01 00:21:31 +11002676 struct net *net = in_skb->sk->sk_net;
Thomas Grafd889ce32006-08-17 18:15:44 -07002677 struct rtmsg *rtm;
2678 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002679 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002680 __be32 dst = 0;
2681 __be32 src = 0;
2682 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002683 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002684 struct sk_buff *skb;
2685
Denis V. Lunevb8542722007-12-01 00:21:31 +11002686 if (net != &init_net)
2687 return -EINVAL;
2688
Thomas Grafd889ce32006-08-17 18:15:44 -07002689 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2690 if (err < 0)
2691 goto errout;
2692
2693 rtm = nlmsg_data(nlh);
2694
Linus Torvalds1da177e2005-04-16 15:20:36 -07002695 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002696 if (skb == NULL) {
2697 err = -ENOBUFS;
2698 goto errout;
2699 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002700
2701	/* Reserve room for dummy headers; this skb can pass
2702	   through a good chunk of the routing engine.
2703 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002704 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002705 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002706
2707 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002708 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002709 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2710
Al Viro17fb2c62006-09-26 22:15:25 -07002711 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2712 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002713 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002714
2715 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002716 struct net_device *dev;
2717
Eric W. Biederman881d9662007-09-17 11:56:21 -07002718 dev = __dev_get_by_index(&init_net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002719 if (dev == NULL) {
2720 err = -ENODEV;
2721 goto errout_free;
2722 }
2723
Linus Torvalds1da177e2005-04-16 15:20:36 -07002724 skb->protocol = htons(ETH_P_IP);
2725 skb->dev = dev;
2726 local_bh_disable();
2727 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2728 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002729
2730 rt = (struct rtable*) skb->dst;
2731 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002732 err = -rt->u.dst.error;
2733 } else {
Thomas Grafd889ce32006-08-17 18:15:44 -07002734 struct flowi fl = {
2735 .nl_u = {
2736 .ip4_u = {
2737 .daddr = dst,
2738 .saddr = src,
2739 .tos = rtm->rtm_tos,
2740 },
2741 },
2742 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2743 };
Denis V. Lunevf2063512008-01-22 22:07:34 -08002744 err = ip_route_output_key(&init_net, &rt, &fl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002745 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002746
Linus Torvalds1da177e2005-04-16 15:20:36 -07002747 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002748 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002749
2750 skb->dst = &rt->u.dst;
2751 if (rtm->rtm_flags & RTM_F_NOTIFY)
2752 rt->rt_flags |= RTCF_NOTIFY;
2753
Linus Torvalds1da177e2005-04-16 15:20:36 -07002754 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002755 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002756 if (err <= 0)
2757 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002758
Denis V. Lunev97c53ca2007-11-19 22:26:51 -08002759 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002760errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002761 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762
Thomas Grafd889ce32006-08-17 18:15:44 -07002763errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002764 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002765 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002766}
2767
2768int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2769{
2770 struct rtable *rt;
2771 int h, s_h;
2772 int idx, s_idx;
2773
2774 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002775 if (s_h < 0)
2776 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777 s_idx = idx = cb->args[1];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002778 for (h = s_h; h <= rt_hash_mask; h++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002779 rcu_read_lock_bh();
2780 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002781 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002782 if (idx < s_idx)
2783 continue;
2784 skb->dst = dst_clone(&rt->u.dst);
2785 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002786 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002787 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788 dst_release(xchg(&skb->dst, NULL));
2789 rcu_read_unlock_bh();
2790 goto done;
2791 }
2792 dst_release(xchg(&skb->dst, NULL));
2793 }
2794 rcu_read_unlock_bh();
Eric Dumazetd8c92832008-01-07 21:52:14 -08002795 s_idx = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002796 }
2797
2798done:
2799 cb->args[0] = h;
2800 cb->args[1] = idx;
2801 return skb->len;
2802}
2803
2804void ip_rt_multicast_event(struct in_device *in_dev)
2805{
2806 rt_cache_flush(0);
2807}
2808
2809#ifdef CONFIG_SYSCTL
2810static int flush_delay;
2811
2812static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2813 struct file *filp, void __user *buffer,
2814 size_t *lenp, loff_t *ppos)
2815{
2816 if (write) {
2817 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2818 rt_cache_flush(flush_delay);
2819 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002820 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821
2822 return -EINVAL;
2823}
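/*
 * Usage sketch for the handler above: the "flush" entry is write-only
 * (mode 0200), so
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * flushes the route cache via rt_cache_flush() (here with delay 0),
 * while reading the file fails with -EINVAL.
 */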
2824
2825static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2826 int __user *name,
2827 int nlen,
2828 void __user *oldval,
2829 size_t __user *oldlenp,
2830 void __user *newval,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -08002831 size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002832{
2833 int delay;
2834 if (newlen != sizeof(int))
2835 return -EINVAL;
2836 if (get_user(delay, (int __user *)newval))
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002837 return -EFAULT;
2838 rt_cache_flush(delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002839 return 0;
2840}
2841
2842ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002843 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002844 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2845 .procname = "flush",
2846 .data = &flush_delay,
2847 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002848 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849 .proc_handler = &ipv4_sysctl_rtcache_flush,
2850 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2851 },
2852 {
2853 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY,
2854 .procname = "min_delay",
2855 .data = &ip_rt_min_delay,
2856 .maxlen = sizeof(int),
2857 .mode = 0644,
2858 .proc_handler = &proc_dointvec_jiffies,
2859 .strategy = &sysctl_jiffies,
2860 },
2861 {
2862 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY,
2863 .procname = "max_delay",
2864 .data = &ip_rt_max_delay,
2865 .maxlen = sizeof(int),
2866 .mode = 0644,
2867 .proc_handler = &proc_dointvec_jiffies,
2868 .strategy = &sysctl_jiffies,
2869 },
2870 {
2871 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2872 .procname = "gc_thresh",
2873 .data = &ipv4_dst_ops.gc_thresh,
2874 .maxlen = sizeof(int),
2875 .mode = 0644,
2876 .proc_handler = &proc_dointvec,
2877 },
2878 {
2879 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2880 .procname = "max_size",
2881 .data = &ip_rt_max_size,
2882 .maxlen = sizeof(int),
2883 .mode = 0644,
2884 .proc_handler = &proc_dointvec,
2885 },
2886 {
2887 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002888
Linus Torvalds1da177e2005-04-16 15:20:36 -07002889 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2890 .procname = "gc_min_interval",
2891 .data = &ip_rt_gc_min_interval,
2892 .maxlen = sizeof(int),
2893 .mode = 0644,
2894 .proc_handler = &proc_dointvec_jiffies,
2895 .strategy = &sysctl_jiffies,
2896 },
2897 {
2898 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2899 .procname = "gc_min_interval_ms",
2900 .data = &ip_rt_gc_min_interval,
2901 .maxlen = sizeof(int),
2902 .mode = 0644,
2903 .proc_handler = &proc_dointvec_ms_jiffies,
2904 .strategy = &sysctl_ms_jiffies,
2905 },
2906 {
2907 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2908 .procname = "gc_timeout",
2909 .data = &ip_rt_gc_timeout,
2910 .maxlen = sizeof(int),
2911 .mode = 0644,
2912 .proc_handler = &proc_dointvec_jiffies,
2913 .strategy = &sysctl_jiffies,
2914 },
2915 {
2916 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2917 .procname = "gc_interval",
2918 .data = &ip_rt_gc_interval,
2919 .maxlen = sizeof(int),
2920 .mode = 0644,
2921 .proc_handler = &proc_dointvec_jiffies,
2922 .strategy = &sysctl_jiffies,
2923 },
2924 {
2925 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2926 .procname = "redirect_load",
2927 .data = &ip_rt_redirect_load,
2928 .maxlen = sizeof(int),
2929 .mode = 0644,
2930 .proc_handler = &proc_dointvec,
2931 },
2932 {
2933 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2934 .procname = "redirect_number",
2935 .data = &ip_rt_redirect_number,
2936 .maxlen = sizeof(int),
2937 .mode = 0644,
2938 .proc_handler = &proc_dointvec,
2939 },
2940 {
2941 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2942 .procname = "redirect_silence",
2943 .data = &ip_rt_redirect_silence,
2944 .maxlen = sizeof(int),
2945 .mode = 0644,
2946 .proc_handler = &proc_dointvec,
2947 },
2948 {
2949 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2950 .procname = "error_cost",
2951 .data = &ip_rt_error_cost,
2952 .maxlen = sizeof(int),
2953 .mode = 0644,
2954 .proc_handler = &proc_dointvec,
2955 },
2956 {
2957 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2958 .procname = "error_burst",
2959 .data = &ip_rt_error_burst,
2960 .maxlen = sizeof(int),
2961 .mode = 0644,
2962 .proc_handler = &proc_dointvec,
2963 },
2964 {
2965 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2966 .procname = "gc_elasticity",
2967 .data = &ip_rt_gc_elasticity,
2968 .maxlen = sizeof(int),
2969 .mode = 0644,
2970 .proc_handler = &proc_dointvec,
2971 },
2972 {
2973 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2974 .procname = "mtu_expires",
2975 .data = &ip_rt_mtu_expires,
2976 .maxlen = sizeof(int),
2977 .mode = 0644,
2978 .proc_handler = &proc_dointvec_jiffies,
2979 .strategy = &sysctl_jiffies,
2980 },
2981 {
2982 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2983 .procname = "min_pmtu",
2984 .data = &ip_rt_min_pmtu,
2985 .maxlen = sizeof(int),
2986 .mode = 0644,
2987 .proc_handler = &proc_dointvec,
2988 },
2989 {
2990 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2991 .procname = "min_adv_mss",
2992 .data = &ip_rt_min_advmss,
2993 .maxlen = sizeof(int),
2994 .mode = 0644,
2995 .proc_handler = &proc_dointvec,
2996 },
2997 {
2998 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2999 .procname = "secret_interval",
3000 .data = &ip_rt_secret_interval,
3001 .maxlen = sizeof(int),
3002 .mode = 0644,
3003 .proc_handler = &proc_dointvec_jiffies,
3004 .strategy = &sysctl_jiffies,
3005 },
3006 { .ctl_name = 0 }
3007};
3008#endif
3009
3010#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet8dbde282007-11-16 03:32:10 -08003011struct ip_rt_acct *ip_rt_acct __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003012#endif /* CONFIG_NET_CLS_ROUTE */
3013
3014static __initdata unsigned long rhash_entries;
3015static int __init set_rhash_entries(char *str)
3016{
3017 if (!str)
3018 return 0;
3019 rhash_entries = simple_strtoul(str, &str, 0);
3020 return 1;
3021}
3022__setup("rhash_entries=", set_rhash_entries);
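/*
 * Example (hypothetical value): booting with "rhash_entries=262144" on
 * the kernel command line sizes the route-cache hash from the command
 * line instead of the memory-based default that
 * alloc_large_system_hash() computes in ip_rt_init() below.
 */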
3023
3024int __init ip_rt_init(void)
3025{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003026 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003027
3028 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3029 (jiffies ^ (jiffies >> 7)));
3030
3031#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet8dbde282007-11-16 03:32:10 -08003032 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003033 if (!ip_rt_acct)
3034 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003035#endif
3036
Alexey Dobriyane5d679f2006-08-26 19:25:52 -07003037 ipv4_dst_ops.kmem_cachep =
3038 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003039 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003040
David S. Miller14e50e52007-05-24 18:17:54 -07003041 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3042
Eric Dumazet424c4b72005-07-05 14:58:19 -07003043 rt_hash_table = (struct rt_hash_bucket *)
3044 alloc_large_system_hash("IP route cache",
3045 sizeof(struct rt_hash_bucket),
3046 rhash_entries,
3047 (num_physpages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003048 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003049 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003050 &rt_hash_log,
3051 &rt_hash_mask,
3052 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003053 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3054 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003055
3056 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3057 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3058
Linus Torvalds1da177e2005-04-16 15:20:36 -07003059 devinet_init();
3060 ip_fib_init();
3061
Pavel Emelyanovb24b8a22008-01-23 21:20:07 -08003062 setup_timer(&rt_flush_timer, rt_run_flush, 0);
3063 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003064
3065	/* All the timers started at system startup tend
3066	   to synchronize. Perturb them a bit.
3067 */
Eric Dumazet39c90ec2007-09-15 10:55:54 -07003068 schedule_delayed_work(&expires_work,
3069 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003070
3071 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3072 ip_rt_secret_interval;
3073 add_timer(&rt_secret_timer);
3074
Pavel Emelyanov107f1632007-12-05 21:14:28 -08003075 if (ip_rt_proc_init(&init_net))
3076 printk(KERN_ERR "Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003077#ifdef CONFIG_XFRM
3078 xfrm_init();
3079 xfrm4_init();
3080#endif
Thomas Graf63f34442007-03-22 11:55:17 -07003081 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3082
Linus Torvalds1da177e2005-04-16 15:20:36 -07003083 return rc;
3084}
3085
3086EXPORT_SYMBOL(__ip_select_ident);
3087EXPORT_SYMBOL(ip_route_input);
3088EXPORT_SYMBOL(ip_route_output_key);