blob: d413b57be9b35a38bfab24d16effbf9223a1a991 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Herbert Xu352e5122007-11-13 21:34:06 -080093#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020094#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070095#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700105#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700106#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
/*
 * Extract the routing-relevant TOS bits (plus the RTO_ONLINK flag) from a
 * flow.  The argument is parenthesised so that any pointer expression may
 * be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

/* 300 * HZ jiffies; used as the default for ip_rt_gc_timeout. */
#define RT_GC_TIMEOUT	(300*HZ)
117
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700119static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122static int ip_rt_redirect_number __read_mostly = 9;
123static int ip_rt_redirect_load __read_mostly = HZ / 50;
124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly = HZ;
126static int ip_rt_error_burst __read_mostly = 5 * HZ;
127static int ip_rt_gc_elasticity __read_mostly = 8;
128static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
Neil Horman1080d702008-10-27 12:28:25 -0700132static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700133
Eric Dumazet125bb8f2009-06-11 20:10:07 +0000134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136
137/*
138 * Interface to generic destination cache.
139 */
140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142static void ipv4_dst_destroy(struct dst_entry *dst);
143static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800148static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700149
150
151static struct dst_ops ipv4_dst_ops = {
152 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -0800153 .protocol = cpu_to_be16(ETH_P_IP),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700154 .gc = rt_garbage_collect,
155 .check = ipv4_dst_check,
156 .destroy = ipv4_dst_destroy,
157 .ifdown = ipv4_dst_ifdown,
158 .negative_advice = ipv4_negative_advice,
159 .link_failure = ipv4_link_failure,
160 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu1ac06e02008-05-20 14:32:14 -0700161 .local_out = __ip_local_out,
Eric Dumazete2422972008-01-30 20:07:45 -0800162 .entries = ATOMIC_INIT(0),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163};
164
165#define ECN_OR_COST(class) TC_PRIO_##class
166
Philippe De Muyter4839c522007-07-09 15:32:57 -0700167const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700168 TC_PRIO_BESTEFFORT,
169 ECN_OR_COST(FILLER),
170 TC_PRIO_BESTEFFORT,
171 ECN_OR_COST(BESTEFFORT),
172 TC_PRIO_BULK,
173 ECN_OR_COST(BULK),
174 TC_PRIO_BULK,
175 ECN_OR_COST(BULK),
176 TC_PRIO_INTERACTIVE,
177 ECN_OR_COST(INTERACTIVE),
178 TC_PRIO_INTERACTIVE,
179 ECN_OR_COST(INTERACTIVE),
180 TC_PRIO_INTERACTIVE_BULK,
181 ECN_OR_COST(INTERACTIVE_BULK),
182 TC_PRIO_INTERACTIVE_BULK,
183 ECN_OR_COST(INTERACTIVE_BULK)
184};
185
186
187/*
188 * Route cache.
189 */
190
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
200
/* One slot of the route cache hash table: head of a singly linked chain. */
struct rt_hash_bucket {
	struct rtable	*chain;	/* first entry, NULL when the slot is empty */
};
Neil Horman1080d702008-10-27 12:28:25 -0700204
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/*
 * Map a hash slot onto the spinlock that guards it.  The table size is a
 * power of two, so masking selects the lock.  The whole expansion is
 * parenthesised so it behaves as a single expression wherever it is used.
 */
# define rt_hash_lock_addr(slot) \
	(&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

/*
 * Allocate and initialise the shared spinlock table.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: the lock address is simply NULL. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700250
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700251static struct rt_hash_bucket *rt_hash_table __read_mostly;
252static unsigned rt_hash_mask __read_mostly;
253static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700254
Eric Dumazet2f970d82006-01-17 02:54:36 -0800255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800256#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700257 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700259static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
260 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261{
Stephen Hemminger1294fc42008-04-10 01:54:01 -0700262 return jhash_3words((__force u32)(__be32)(daddr),
263 (__force u32)(__be32)(saddr),
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700264 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800265 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266}
267
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700268static inline int rt_genid(struct net *net)
269{
270 return atomic_read(&net->ipv4.rt_genid);
271}
272
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273#ifdef CONFIG_PROC_FS
274struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800275 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700276 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800277 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278};
279
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900280static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700281{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900282 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700283 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284
285 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
Eric Dumazeta6272662008-08-28 01:11:25 -0700286 if (!rt_hash_table[st->bucket].chain)
287 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700288 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -0800289 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
Eric Dumazet29e75252008-01-31 17:05:09 -0800290 while (r) {
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900291 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800292 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800293 return r;
Paul E. McKenneya898def2010-02-22 17:04:49 -0800294 r = rcu_dereference_bh(r->u.dst.rt_next);
Eric Dumazet29e75252008-01-31 17:05:09 -0800295 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296 rcu_read_unlock_bh();
297 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800298 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700299}
300
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900301static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800302 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900304 struct rt_cache_iter_state *st = seq->private;
Eric Dumazeta6272662008-08-28 01:11:25 -0700305
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800306 r = r->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700307 while (!r) {
308 rcu_read_unlock_bh();
Eric Dumazeta6272662008-08-28 01:11:25 -0700309 do {
310 if (--st->bucket < 0)
311 return NULL;
312 } while (!rt_hash_table[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700313 rcu_read_lock_bh();
314 r = rt_hash_table[st->bucket].chain;
315 }
Paul E. McKenneya898def2010-02-22 17:04:49 -0800316 return rcu_dereference_bh(r);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700317}
318
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900319static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800320 struct rtable *r)
321{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900322 struct rt_cache_iter_state *st = seq->private;
323 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
324 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800325 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800326 if (r->rt_genid == st->genid)
327 break;
328 }
329 return r;
330}
331
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900332static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700333{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900334 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335
336 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900337 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338 --pos;
339 return pos ? NULL : r;
340}
341
342static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
343{
Eric Dumazet29e75252008-01-31 17:05:09 -0800344 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800345 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900346 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700347 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800348 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700349}
350
351static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
352{
Eric Dumazet29e75252008-01-31 17:05:09 -0800353 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354
355 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900356 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700357 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900358 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700359 ++*pos;
360 return r;
361}
362
363static void rt_cache_seq_stop(struct seq_file *seq, void *v)
364{
365 if (v && v != SEQ_START_TOKEN)
366 rcu_read_unlock_bh();
367}
368
369static int rt_cache_seq_show(struct seq_file *seq, void *v)
370{
371 if (v == SEQ_START_TOKEN)
372 seq_printf(seq, "%-127s\n",
373 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
374 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
375 "HHUptod\tSpecDst");
376 else {
377 struct rtable *r = v;
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700378 int len;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700380 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
381 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700382 r->u.dst.dev ? r->u.dst.dev->name : "*",
383 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
384 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
385 r->u.dst.__use, 0, (unsigned long)r->rt_src,
386 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
387 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
388 dst_metric(&r->u.dst, RTAX_WINDOW),
389 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
390 dst_metric(&r->u.dst, RTAX_RTTVAR)),
391 r->fl.fl4_tos,
392 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
393 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
394 dev_queue_xmit) : 0,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700395 r->rt_spec_dst, &len);
396
397 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900398 }
399 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700400}
401
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700402static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700403 .start = rt_cache_seq_start,
404 .next = rt_cache_seq_next,
405 .stop = rt_cache_seq_stop,
406 .show = rt_cache_seq_show,
407};
408
409static int rt_cache_seq_open(struct inode *inode, struct file *file)
410{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800411 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700412 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700413}
414
Arjan van de Ven9a321442007-02-12 00:55:35 -0800415static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700416 .owner = THIS_MODULE,
417 .open = rt_cache_seq_open,
418 .read = seq_read,
419 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800420 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700421};
422
423
424static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
425{
426 int cpu;
427
428 if (*pos == 0)
429 return SEQ_START_TOKEN;
430
Rusty Russell0f23174a2008-12-29 12:23:42 +0000431 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700432 if (!cpu_possible(cpu))
433 continue;
434 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800435 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700436 }
437 return NULL;
438}
439
440static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
441{
442 int cpu;
443
Rusty Russell0f23174a2008-12-29 12:23:42 +0000444 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700445 if (!cpu_possible(cpu))
446 continue;
447 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800448 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700449 }
450 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900451
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452}
453
/* seq_file ->stop: the start/next callbacks take no locks, nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
458
459static int rt_cpu_seq_show(struct seq_file *seq, void *v)
460{
461 struct rt_cache_stat *st = v;
462
463 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700464 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700465 return 0;
466 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900467
Linus Torvalds1da177e2005-04-16 15:20:36 -0700468 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
469 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
470 atomic_read(&ipv4_dst_ops.entries),
471 st->in_hit,
472 st->in_slow_tot,
473 st->in_slow_mc,
474 st->in_no_route,
475 st->in_brd,
476 st->in_martian_dst,
477 st->in_martian_src,
478
479 st->out_hit,
480 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900481 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700482
483 st->gc_total,
484 st->gc_ignored,
485 st->gc_goal_miss,
486 st->gc_dst_overflow,
487 st->in_hlist_search,
488 st->out_hlist_search
489 );
490 return 0;
491}
492
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700493static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700494 .start = rt_cpu_seq_start,
495 .next = rt_cpu_seq_next,
496 .stop = rt_cpu_seq_stop,
497 .show = rt_cpu_seq_show,
498};
499
500
501static int rt_cpu_seq_open(struct inode *inode, struct file *file)
502{
503 return seq_open(file, &rt_cpu_seq_ops);
504}
505
Arjan van de Ven9a321442007-02-12 00:55:35 -0800506static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700507 .owner = THIS_MODULE,
508 .open = rt_cpu_seq_open,
509 .read = seq_read,
510 .llseek = seq_lseek,
511 .release = seq_release,
512};
513
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800514#ifdef CONFIG_NET_CLS_ROUTE
Alexey Dobriyana661c412009-11-25 15:40:35 -0800515static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800516{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800517 struct ip_rt_acct *dst, *src;
518 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800519
Alexey Dobriyana661c412009-11-25 15:40:35 -0800520 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
521 if (!dst)
522 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800523
Alexey Dobriyana661c412009-11-25 15:40:35 -0800524 for_each_possible_cpu(i) {
525 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
526 for (j = 0; j < 256; j++) {
527 dst[j].o_bytes += src[j].o_bytes;
528 dst[j].o_packets += src[j].o_packets;
529 dst[j].i_bytes += src[j].i_bytes;
530 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800531 }
532 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800533
534 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
535 kfree(dst);
536 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800537}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800538
539static int rt_acct_proc_open(struct inode *inode, struct file *file)
540{
541 return single_open(file, rt_acct_proc_show, NULL);
542}
543
544static const struct file_operations rt_acct_proc_fops = {
545 .owner = THIS_MODULE,
546 .open = rt_acct_proc_open,
547 .read = seq_read,
548 .llseek = seq_lseek,
549 .release = single_release,
550};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800551#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800552
Denis V. Lunev73b38712008-02-28 20:51:18 -0800553static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800554{
555 struct proc_dir_entry *pde;
556
557 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
558 &rt_cache_seq_fops);
559 if (!pde)
560 goto err1;
561
Wang Chen77020722008-02-28 14:14:25 -0800562 pde = proc_create("rt_cache", S_IRUGO,
563 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800564 if (!pde)
565 goto err2;
566
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800567#ifdef CONFIG_NET_CLS_ROUTE
Alexey Dobriyana661c412009-11-25 15:40:35 -0800568 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800569 if (!pde)
570 goto err3;
571#endif
572 return 0;
573
574#ifdef CONFIG_NET_CLS_ROUTE
575err3:
576 remove_proc_entry("rt_cache", net->proc_net_stat);
577#endif
578err2:
579 remove_proc_entry("rt_cache", net->proc_net);
580err1:
581 return -ENOMEM;
582}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800583
584static void __net_exit ip_rt_do_proc_exit(struct net *net)
585{
586 remove_proc_entry("rt_cache", net->proc_net_stat);
587 remove_proc_entry("rt_cache", net->proc_net);
Alexey Dobriyan0a931ac2010-01-17 03:32:50 +0000588#ifdef CONFIG_NET_CLS_ROUTE
Denis V. Lunev73b38712008-02-28 20:51:18 -0800589 remove_proc_entry("rt_acct", net->proc_net);
Alexey Dobriyan0a931ac2010-01-17 03:32:50 +0000590#endif
Denis V. Lunev73b38712008-02-28 20:51:18 -0800591}
592
593static struct pernet_operations ip_rt_proc_ops __net_initdata = {
594 .init = ip_rt_do_proc_init,
595 .exit = ip_rt_do_proc_exit,
596};
597
598static int __init ip_rt_proc_init(void)
599{
600 return register_pernet_subsys(&ip_rt_proc_ops);
601}
602
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800603#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700608#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900609
Stephen Hemminger5969f712008-04-10 01:52:09 -0700610static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700611{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
613}
614
Stephen Hemminger5969f712008-04-10 01:52:09 -0700615static inline void rt_drop(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700616{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700617 ip_rt_put(rt);
618 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
619}
620
Stephen Hemminger5969f712008-04-10 01:52:09 -0700621static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700622{
623 /* Kill broadcast/multicast entries very aggresively, if they
624 collide in hash table with more useful entries */
625 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800626 rth->fl.iif && rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700627}
628
Stephen Hemminger5969f712008-04-10 01:52:09 -0700629static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630{
631 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
632 rth->u.dst.expires;
633}
634
635static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
636{
637 unsigned long age;
638 int ret = 0;
639
640 if (atomic_read(&rth->u.dst.__refcnt))
641 goto out;
642
643 ret = 1;
644 if (rth->u.dst.expires &&
645 time_after_eq(jiffies, rth->u.dst.expires))
646 goto out;
647
648 age = jiffies - rth->u.dst.lastuse;
649 ret = 0;
650 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
651 (age <= tmo2 && rt_valuable(rth)))
652 goto out;
653 ret = 1;
654out: return ret;
655}
656
657/* Bits of score are:
658 * 31: very valuable
659 * 30: not quite useless
660 * 29..0: usage counter
661 */
662static inline u32 rt_score(struct rtable *rt)
663{
664 u32 score = jiffies - rt->u.dst.lastuse;
665
666 score = ~score & ~(3<<30);
667
668 if (rt_valuable(rt))
669 score |= (1<<31);
670
671 if (!rt->fl.iif ||
672 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
673 score |= (1<<30);
674
675 return score;
676}
677
Neil Horman1080d702008-10-27 12:28:25 -0700678static inline bool rt_caching(const struct net *net)
679{
680 return net->ipv4.current_rt_cache_rebuild_count <=
681 net->ipv4.sysctl_rt_cache_rebuild_count;
682}
683
684static inline bool compare_hash_inputs(const struct flowi *fl1,
685 const struct flowi *fl2)
686{
687 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
688 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
689 (fl1->iif ^ fl2->iif)) == 0);
690}
691
Linus Torvalds1da177e2005-04-16 15:20:36 -0700692static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
693{
Al Viro714e85b2006-11-14 20:51:49 -0800694 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
695 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
Thomas Graf47dcf0c2006-11-09 15:20:38 -0800696 (fl1->mark ^ fl2->mark) |
David S. Miller8238b212006-10-12 00:49:15 -0700697 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
698 *(u16 *)&fl2->nl_u.ip4_u.tos) |
699 (fl1->oif ^ fl2->oif) |
700 (fl1->iif ^ fl2->iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700701}
702
Denis V. Lunevb5921912008-01-22 23:50:25 -0800703static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
704{
Octavian Purdila09ad9bc2009-11-25 15:14:13 -0800705 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800706}
707
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700708static inline int rt_is_expired(struct rtable *rth)
709{
710 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
711}
712
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800713/*
714 * Perform a full scan of hash table and free all entries.
715 * Can be called by a softirq or a process.
716 * In the later case, we want to be reschedule if necessary
717 */
718static void rt_do_flush(int process_context)
719{
720 unsigned int i;
721 struct rtable *rth, *next;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700722 struct rtable * tail;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800723
724 for (i = 0; i <= rt_hash_mask; i++) {
725 if (process_context && need_resched())
726 cond_resched();
727 rth = rt_hash_table[i].chain;
728 if (!rth)
729 continue;
730
731 spin_lock_bh(rt_hash_lock_addr(i));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700732#ifdef CONFIG_NET_NS
733 {
734 struct rtable ** prev, * p;
735
736 rth = rt_hash_table[i].chain;
737
738 /* defer releasing the head of the list after spin_unlock */
739 for (tail = rth; tail; tail = tail->u.dst.rt_next)
740 if (!rt_is_expired(tail))
741 break;
742 if (rth != tail)
743 rt_hash_table[i].chain = tail;
744
745 /* call rt_free on entries after the tail requiring flush */
746 prev = &rt_hash_table[i].chain;
747 for (p = *prev; p; p = next) {
748 next = p->u.dst.rt_next;
749 if (!rt_is_expired(p)) {
750 prev = &p->u.dst.rt_next;
751 } else {
752 *prev = next;
753 rt_free(p);
754 }
755 }
756 }
757#else
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800758 rth = rt_hash_table[i].chain;
759 rt_hash_table[i].chain = NULL;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700760 tail = NULL;
761#endif
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800762 spin_unlock_bh(rt_hash_lock_addr(i));
763
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700764 for (; rth != tail; rth = next) {
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800765 next = rth->u.dst.rt_next;
766 rt_free(rth);
767 }
768 }
769}
770
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */
778
779#define FRACT_BITS 3
780#define ONE (1UL << FRACT_BITS)
781
Eric Dumazet98376382010-03-08 03:20:00 +0000782/*
783 * Given a hash chain and an item in this hash chain,
784 * find if a previous entry has the same hash_inputs
785 * (but differs on tos, mark or oif)
786 * Returns 0 if an alias is found.
787 * Returns ONE if rth has no alias before itself.
788 */
789static int has_noalias(const struct rtable *head, const struct rtable *rth)
790{
791 const struct rtable *aux = head;
792
793 while (aux != rth) {
794 if (compare_hash_inputs(&aux->fl, &rth->fl))
795 return 0;
796 aux = aux->u.dst.rt_next;
797 }
798 return ONE;
799}
800
/*
 * Periodic partial scan of the route cache hash table.
 *
 * The number of buckets visited per call is derived from the jiffies
 * elapsed since the previous call (tracked in expires_ljiffies), scaled
 * by the table size and divided by ip_rt_gc_timeout, and capped at one
 * full pass over the table.  The scan resumes after the bucket where the
 * previous call stopped (static 'rover').
 *
 * While scanning, per-bucket chain lengths (in FRACT_BITS fixed point,
 * counted through has_noalias()) are accumulated so that
 * rt_chain_length_max can be refreshed at the end.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	/* Never scan more than the whole table in one call. */
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		/* Empty buckets still count as a sample (of length zero). */
		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					/*
					 * Entry is kept: halve the timeout
					 * applied to the entries behind it
					 * and step to the next link.
					 */
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* avg and sd are still in FRACT_BITS fixed point here. */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	/* Remember where to resume on the next call. */
	rover = i;
}
879
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table,
 * then re-queue expires_work to run again after ip_rt_gc_interval.
 *
 * @work: the work item being run; not used by this handler.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
889
Eric Dumazet29e75252008-01-31 17:05:09 -0800890/*
891 * Pertubation of rt_genid by a small quantity [1..256]
892 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
893 * many times (2^24) without giving recent rt_genid.
894 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700895 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700896static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700897{
Eric Dumazet29e75252008-01-31 17:05:09 -0800898 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700899
Eric Dumazet29e75252008-01-31 17:05:09 -0800900 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700901 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700902}
903
/*
 * Invalidate the route cache of @net and optionally flush it.
 *
 * delay <  0 : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	/* The flush may reschedule only when we are not in softirq. */
	rt_do_flush(!in_softirq());
}
914
/*
 * Flush previously invalidated entries from the cache.
 * Unlike rt_cache_flush(), the generation id is not changed here.
 */
void rt_cache_flush_batch(void)
{
	int process_context = !in_softirq();

	rt_do_flush(process_context);
}
920
Eric Dumazet29e75252008-01-31 17:05:09 -0800921/*
922 * We change rt_genid and let gc do the cleanup
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800923 */
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700924static void rt_secret_rebuild(unsigned long __net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700925{
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700926 struct net *net = (struct net *)__net;
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700927 rt_cache_invalidate(net);
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700928 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700929}
930
Neil Horman1080d702008-10-27 12:28:25 -0700931static void rt_secret_rebuild_oneshot(struct net *net)
932{
933 del_timer_sync(&net->ipv4.rt_secret_timer);
934 rt_cache_invalidate(net);
Vitaliy Gusev858a18a2010-03-16 01:07:51 +0000935 if (ip_rt_secret_interval)
936 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
Neil Horman1080d702008-10-27 12:28:25 -0700937}
938
/*
 * Called when a hash chain has grown too long: emit a rate-limited
 * warning and force an immediate invalidation of the cache of @net via
 * rt_secret_rebuild_oneshot().
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}
948
Linus Torvalds1da177e2005-04-16 15:20:36 -0700949/*
950 Short description of GC goals.
951
952 We want to build algorithm, which will keep routing cache
953 at some equilibrium point, when number of aged off entries
954 is kept approximately equal to newly generated ones.
955
956 Current expiration strength is variable "expire".
957 We try to adjust it dynamically, so that if networking
958 is idle expires is large enough to keep enough of warm entries,
959 and when load increases it reduces to limit cache size.
960 */
961
Daniel Lezcano569d3642008-01-18 03:56:57 -0800962static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700963{
964 static unsigned long expire = RT_GC_TIMEOUT;
965 static unsigned long last_gc;
966 static int rover;
967 static int equilibrium;
968 struct rtable *rth, **rthp;
969 unsigned long now = jiffies;
970 int goal;
971
972 /*
973 * Garbage collection is pretty expensive,
974 * do not make it too frequently.
975 */
976
977 RT_CACHE_STAT_INC(gc_total);
978
979 if (now - last_gc < ip_rt_gc_min_interval &&
980 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
981 RT_CACHE_STAT_INC(gc_ignored);
982 goto out;
983 }
984
985 /* Calculate number of entries, which we want to expire now. */
986 goal = atomic_read(&ipv4_dst_ops.entries) -
987 (ip_rt_gc_elasticity << rt_hash_log);
988 if (goal <= 0) {
989 if (equilibrium < ipv4_dst_ops.gc_thresh)
990 equilibrium = ipv4_dst_ops.gc_thresh;
991 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
992 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -0800993 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700994 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
995 }
996 } else {
997 /* We are in dangerous area. Try to reduce cache really
998 * aggressively.
999 */
Eric Dumazetb790ced2007-12-21 01:49:07 -08001000 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001001 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
1002 }
1003
1004 if (now - last_gc >= ip_rt_gc_min_interval)
1005 last_gc = now;
1006
1007 if (goal <= 0) {
1008 equilibrium += goal;
1009 goto work_done;
1010 }
1011
1012 do {
1013 int i, k;
1014
1015 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1016 unsigned long tmo = expire;
1017
1018 k = (k + 1) & rt_hash_mask;
1019 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001020 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001021 while ((rth = *rthp) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001022 if (!rt_is_expired(rth) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08001023 !rt_may_expire(rth, tmo, expire)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001024 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001025 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001026 continue;
1027 }
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001028 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001029 rt_free(rth);
1030 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001031 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001032 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001033 if (goal <= 0)
1034 break;
1035 }
1036 rover = k;
1037
1038 if (goal <= 0)
1039 goto work_done;
1040
1041 /* Goal is not achieved. We stop process if:
1042
1043 - if expire reduced to zero. Otherwise, expire is halfed.
1044 - if table is not full.
1045 - if we are called from interrupt.
1046 - jiffies check is just fallback/debug loop breaker.
1047 We will not spin here for long time in any case.
1048 */
1049
1050 RT_CACHE_STAT_INC(gc_goal_miss);
1051
1052 if (expire == 0)
1053 break;
1054
1055 expire >>= 1;
1056#if RT_CACHE_DEBUG >= 2
1057 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1058 atomic_read(&ipv4_dst_ops.entries), goal, i);
1059#endif
1060
1061 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1062 goto out;
1063 } while (!in_softirq() && time_before_eq(jiffies, now));
1064
1065 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1066 goto out;
1067 if (net_ratelimit())
1068 printk(KERN_WARNING "dst cache overflow\n");
1069 RT_CACHE_STAT_INC(gc_dst_overflow);
1070 return 1;
1071
1072work_done:
1073 expire += ip_rt_gc_min_interval;
1074 if (expire > ip_rt_gc_timeout ||
1075 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1076 expire = ip_rt_gc_timeout;
1077#if RT_CACHE_DEBUG >= 2
1078 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1079 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1080#endif
1081out: return 0;
1082}
1083
Eric Dumazet98376382010-03-08 03:20:00 +00001084/*
1085 * Returns number of entries in a hash chain that have different hash_inputs
1086 */
1087static int slow_chain_length(const struct rtable *head)
1088{
1089 int length = 0;
1090 const struct rtable *rth = head;
1091
1092 while (rth) {
1093 length += has_noalias(head, rth);
1094 rth = rth->u.dst.rt_next;
1095 }
1096 return length >> FRACT_BITS;
1097}
1098
/*
 * Insert route @rt into hash bucket @hash, or reuse an equivalent entry
 * that is already cached there.
 *
 * @hash:    bucket index for @rt (recomputed here after an emergency
 *           rebuild has changed the generation id).
 * @rt:      the new route; dropped with rt_drop() when an existing entry
 *           is reused or when an error is returned.
 * @rp:      when non-NULL, receives the route the caller should use.
 * @skb:     used only when @rp is NULL: the resulting route is attached
 *           with skb_dst_set().
 * @ifindex: interface index fed to rt_hash() when the hash is recomputed.
 *
 * Returns 0 on success, or the error from arp_bind_neighbour()
 * (-ENOBUFS once the retry budget is exhausted).
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, **rthp;
	unsigned long now;
	struct rtable *cand, **candp;
	u32 min_score;
	int chain_length;
	/* One garbage-collect-and-retry is allowed, but not from softirq. */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note also the rt_free uses call_rcu.  We don't actually
		 * need rcu protection here, this is just our path to get
		 * on the route gc list.
		 */

		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
			int err = arp_bind_neighbour(&rt->u.dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				rt_drop(rt);
				return err;
			}
		}

		rt_free(rt);
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		/* Opportunistically unlink entries of an old generation. */
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* The cached entry wins; the new route is not needed. */
			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		/* Track the unreferenced entry with the lowest score. */
		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		/*
		 * Nothing evictable: if the chain is too long even when
		 * aliases are counted once, invalidate the cache, recompute
		 * the hash with the new generation id and start over.
		 */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/*
				 * Temporarily override the GC tunables for
				 * this one collection, then restore them.
				 */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	/* Hand the new route to the caller. */
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}
1291
1292void rt_bind_peer(struct rtable *rt, int create)
1293{
1294 static DEFINE_SPINLOCK(rt_peer_lock);
1295 struct inet_peer *peer;
1296
1297 peer = inet_getpeer(rt->rt_dst, create);
1298
1299 spin_lock_bh(&rt_peer_lock);
1300 if (rt->peer == NULL) {
1301 rt->peer = peer;
1302 peer = NULL;
1303 }
1304 spin_unlock_bh(&rt_peer_lock);
1305 if (peer)
1306 inet_putpeer(peer);
1307}
1308
1309/*
1310 * Peer allocation may fail only in serious out-of-memory conditions. However
1311 * we still can generate some output.
1312 * Random ID selection looks a bit dangerous because we have no chances to
1313 * select ID being unique in a reasonable period of time.
1314 * But broken packet identifier may be better than no packet at all.
1315 */
1316static void ip_select_fb_ident(struct iphdr *iph)
1317{
1318 static DEFINE_SPINLOCK(ip_fb_id_lock);
1319 static u32 ip_fallback_id;
1320 u32 salt;
1321
1322 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001323 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001324 iph->id = htons(salt & 0xFFFF);
1325 ip_fallback_id = salt;
1326 spin_unlock_bh(&ip_fb_id_lock);
1327}
1328
1329void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1330{
1331 struct rtable *rt = (struct rtable *) dst;
1332
1333 if (rt) {
1334 if (rt->peer == NULL)
1335 rt_bind_peer(rt, 1);
1336
1337 /* If peer is attached to destination, it is never detached,
1338 so that we need not to grab a lock to dereference it.
1339 */
1340 if (rt->peer) {
1341 iph->id = htons(inet_getid(rt->peer, more));
1342 return;
1343 }
1344 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001345 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001346 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001347
1348 ip_select_fb_ident(iph);
1349}
1350
1351static void rt_del(unsigned hash, struct rtable *rt)
1352{
Eric Dumazet29e75252008-01-31 17:05:09 -08001353 struct rtable **rthp, *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001354
Eric Dumazet29e75252008-01-31 17:05:09 -08001355 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001356 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001357 ip_rt_put(rt);
Eric Dumazet29e75252008-01-31 17:05:09 -08001358 while ((aux = *rthp) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001359 if (aux == rt || rt_is_expired(aux)) {
Eric Dumazet29e75252008-01-31 17:05:09 -08001360 *rthp = aux->u.dst.rt_next;
1361 rt_free(aux);
1362 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001363 }
Eric Dumazet29e75252008-01-31 17:05:09 -08001364 rthp = &aux->u.dst.rt_next;
1365 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001366 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001367}
1368
/*
 * Handle a redirect advising that traffic to @daddr should use @new_gw
 * instead of @old_gw.
 *
 * @old_gw: gateway currently in use.
 * @daddr:  destination the redirect is about.
 * @new_gw: gateway being advised.
 * @saddr:  source address of the affected traffic.
 * @dev:    device the redirect applies to.
 *
 * The redirect is rejected (and optionally logged) when the new gateway
 * is not acceptable or route caching is disabled.  Otherwise the cache
 * is searched for output routes under the four combinations of
 * {saddr, 0} x {dev->ifindex, 0}; each match is replaced by a copy that
 * points at @new_gw and carries RTCF_REDIRECTED.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp=&rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				/* Skip entries that do not match this key set. */
				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				/*
				 * Key matches but the route is not the one
				 * being redirected: give up on this chain.
				 */
				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				/* Pin rth before leaving the RCU read section. */
				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->u.dst.__use = 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child = NULL;
				/* The copy needs its own references. */
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete = -1;
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.path = &rt->u.dst;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm = NULL;
#endif
				rt->rt_genid = rt_genid(net);
				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				/*
				 * Use the new route only if a neighbour could
				 * be bound and is in a NUD_VALID state.
				 */
				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
				      NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				/* rt_del() also drops the reference taken above. */
				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			" Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}
1507
1508static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1509{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001510 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511 struct dst_entry *ret = dst;
1512
1513 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001514 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001515 ip_rt_put(rt);
1516 ret = NULL;
1517 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
Guenter Roeck5e016cb2010-03-21 20:55:13 -07001518 (rt->u.dst.expires &&
1519 time_after_eq(jiffies, rt->u.dst.expires))) {
Al Viro8c7bc842006-09-26 21:26:19 -07001520 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001521 rt->fl.oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001522 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523#if RT_CACHE_DEBUG >= 1
Harvey Harrison673d57e2008-10-31 00:53:57 -07001524 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1525 &rt->rt_dst, rt->fl.fl4_tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526#endif
1527 rt_del(hash, rt);
1528 ret = NULL;
1529 }
1530 }
1531 return ret;
1532}
1533
1534/*
1535 * Algorithm:
1536 * 1. The first ip_rt_redirect_number redirects are sent
1537 * with exponential backoff, then we stop sending them at all,
1538 * assuming that the host ignores our redirects.
1539 * 2. If we did not see packets requiring redirects
1540 * during ip_rt_redirect_silence, we assume that the host
1541 * forgot redirected route and start to send redirects again.
1542 *
1543 * This algorithm is much cheaper and more intelligent than dumb load limiting
1544 * in icmp.c.
1545 *
1546 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1547 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1548 */
1549
1550void ip_rt_send_redirect(struct sk_buff *skb)
1551{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001552 struct rtable *rt = skb_rtable(skb);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001553 struct in_device *in_dev;
1554 int log_martians;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001555
Eric Dumazet30038fc2009-08-28 23:52:01 -07001556 rcu_read_lock();
1557 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1558 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1559 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001560 return;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001561 }
1562 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1563 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001564
1565 /* No redirected packets during ip_rt_redirect_silence;
1566 * reset the algorithm.
1567 */
1568 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1569 rt->u.dst.rate_tokens = 0;
1570
1571 /* Too many ignored redirects; do not send anything
1572 * set u.dst.rate_last to the last seen redirected packet.
1573 */
1574 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1575 rt->u.dst.rate_last = jiffies;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001576 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001577 }
1578
1579 /* Check for load limit; set rate_last to the latest sent
1580 * redirect.
1581 */
Li Yewang14fb8a72006-12-18 00:26:35 -08001582 if (rt->u.dst.rate_tokens == 0 ||
1583 time_after(jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584 (rt->u.dst.rate_last +
1585 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1586 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1587 rt->u.dst.rate_last = jiffies;
1588 ++rt->u.dst.rate_tokens;
1589#ifdef CONFIG_IP_ROUTE_VERBOSE
Eric Dumazet30038fc2009-08-28 23:52:01 -07001590 if (log_martians &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001591 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1592 net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07001593 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1594 &rt->rt_src, rt->rt_iif,
1595 &rt->rt_dst, &rt->rt_gateway);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001596#endif
1597 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001598}
1599
1600static int ip_error(struct sk_buff *skb)
1601{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001602 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001603 unsigned long now;
1604 int code;
1605
1606 switch (rt->u.dst.error) {
1607 case EINVAL:
1608 default:
1609 goto out;
1610 case EHOSTUNREACH:
1611 code = ICMP_HOST_UNREACH;
1612 break;
1613 case ENETUNREACH:
1614 code = ICMP_NET_UNREACH;
Pavel Emelyanov7c73a6f2008-07-16 20:20:11 -07001615 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1616 IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617 break;
1618 case EACCES:
1619 code = ICMP_PKT_FILTERED;
1620 break;
1621 }
1622
1623 now = jiffies;
1624 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1625 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1626 rt->u.dst.rate_tokens = ip_rt_error_burst;
1627 rt->u.dst.rate_last = now;
1628 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1629 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1630 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1631 }
1632
1633out: kfree_skb(skb);
1634 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001635}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001636
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value strictly below @old_mtu, or 68 when
 * @old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau != end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1654
Denis V. Lunevb5921912008-01-22 23:50:25 -08001655unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001656 unsigned short new_mtu,
1657 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001658{
Timo Teras0010e462008-04-29 03:32:25 -07001659 int i, k;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001660 unsigned short old_mtu = ntohs(iph->tot_len);
1661 struct rtable *rth;
Timo Teras0010e462008-04-29 03:32:25 -07001662 int ikeys[2] = { dev->ifindex, 0 };
Al Viroe4485152006-09-26 22:15:01 -07001663 __be32 skeys[2] = { iph->saddr, 0, };
1664 __be32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001665 unsigned short est_mtu = 0;
1666
Timo Teras0010e462008-04-29 03:32:25 -07001667 for (k = 0; k < 2; k++) {
1668 for (i = 0; i < 2; i++) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001669 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001670 rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671
Timo Teras0010e462008-04-29 03:32:25 -07001672 rcu_read_lock();
1673 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1674 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001675 unsigned short mtu = new_mtu;
1676
Timo Teras0010e462008-04-29 03:32:25 -07001677 if (rth->fl.fl4_dst != daddr ||
1678 rth->fl.fl4_src != skeys[i] ||
1679 rth->rt_dst != daddr ||
1680 rth->rt_src != iph->saddr ||
1681 rth->fl.oif != ikeys[k] ||
1682 rth->fl.iif != 0 ||
1683 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1684 !net_eq(dev_net(rth->u.dst.dev), net) ||
Hugh Dickins6c3b8fc2008-07-26 17:51:06 -07001685 rt_is_expired(rth))
Timo Teras0010e462008-04-29 03:32:25 -07001686 continue;
1687
Linus Torvalds1da177e2005-04-16 15:20:36 -07001688 if (new_mtu < 68 || new_mtu >= old_mtu) {
1689
1690 /* BSD 4.2 compatibility hack :-( */
1691 if (mtu == 0 &&
Rami Rosen6d273f82008-08-06 02:33:49 -07001692 old_mtu >= dst_mtu(&rth->u.dst) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693 old_mtu >= 68 + (iph->ihl << 2))
1694 old_mtu -= iph->ihl << 2;
1695
1696 mtu = guess_mtu(old_mtu);
1697 }
Rami Rosen6d273f82008-08-06 02:33:49 -07001698 if (mtu <= dst_mtu(&rth->u.dst)) {
1699 if (mtu < dst_mtu(&rth->u.dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700 dst_confirm(&rth->u.dst);
1701 if (mtu < ip_rt_min_pmtu) {
1702 mtu = ip_rt_min_pmtu;
1703 rth->u.dst.metrics[RTAX_LOCK-1] |=
1704 (1 << RTAX_MTU);
1705 }
1706 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1707 dst_set_expires(&rth->u.dst,
1708 ip_rt_mtu_expires);
1709 }
1710 est_mtu = mtu;
1711 }
1712 }
Timo Teras0010e462008-04-29 03:32:25 -07001713 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715 }
1716 return est_mtu ? : new_mtu;
1717}
1718
1719static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1720{
Rami Rosen6d273f82008-08-06 02:33:49 -07001721 if (dst_mtu(dst) > mtu && mtu >= 68 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001722 !(dst_metric_locked(dst, RTAX_MTU))) {
1723 if (mtu < ip_rt_min_pmtu) {
1724 mtu = ip_rt_min_pmtu;
1725 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1726 }
1727 dst->metrics[RTAX_MTU-1] = mtu;
1728 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001729 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730 }
1731}
1732
1733static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1734{
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001735 if (rt_is_expired((struct rtable *)dst))
1736 return NULL;
1737 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001738}
1739
1740static void ipv4_dst_destroy(struct dst_entry *dst)
1741{
1742 struct rtable *rt = (struct rtable *) dst;
1743 struct inet_peer *peer = rt->peer;
1744 struct in_device *idev = rt->idev;
1745
1746 if (peer) {
1747 rt->peer = NULL;
1748 inet_putpeer(peer);
1749 }
1750
1751 if (idev) {
1752 rt->idev = NULL;
1753 in_dev_put(idev);
1754 }
1755}
1756
1757static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1758 int how)
1759{
1760 struct rtable *rt = (struct rtable *) dst;
1761 struct in_device *idev = rt->idev;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001762 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
Denis V. Lunev5a3e55d2007-12-07 00:38:10 -08001763 struct in_device *loopback_idev =
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001764 in_dev_get(dev_net(dev)->loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001765 if (loopback_idev) {
1766 rt->idev = loopback_idev;
1767 in_dev_put(idev);
1768 }
1769 }
1770}
1771
1772static void ipv4_link_failure(struct sk_buff *skb)
1773{
1774 struct rtable *rt;
1775
1776 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1777
Eric Dumazet511c3f92009-06-02 05:14:27 +00001778 rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001779 if (rt)
1780 dst_set_expires(&rt->u.dst, 0);
1781}
1782
1783static int ip_rt_bug(struct sk_buff *skb)
1784{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001785 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1786 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001787 skb->dev ? skb->dev->name : "?");
1788 kfree_skb(skb);
1789 return 0;
1790}
1791
1792/*
1793 We do not cache source address of outgoing interface,
1794 because it is used only by IP RR, TS and SRR options,
1795 so that it out of fast path.
1796
1797 BTW remember: "addr" is allowed to be not aligned
1798 in IP options!
1799 */
1800
1801void ip_rt_get_source(u8 *addr, struct rtable *rt)
1802{
Al Viroa61ced52006-09-26 21:27:54 -07001803 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001804 struct fib_result res;
1805
1806 if (rt->fl.iif == 0)
1807 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001808 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001809 src = FIB_RES_PREFSRC(res);
1810 fib_res_put(&res);
1811 } else
1812 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1813 RT_SCOPE_UNIVERSE);
1814 memcpy(addr, &src, 4);
1815}
1816
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Fill each 16-bit half of the route's tclassid that is still zero with the
 * corresponding half of @tag; halves that are already set are left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	/* low half */
	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF;

	/* high half */
	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1826
1827static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1828{
1829 struct fib_info *fi = res->fi;
1830
1831 if (fi) {
1832 if (FIB_RES_GW(*res) &&
1833 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1834 rt->rt_gateway = FIB_RES_GW(*res);
1835 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1836 sizeof(rt->u.dst.metrics));
1837 if (fi->fib_mtu == 0) {
1838 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
Satoru SATOH0bbeafd2008-05-04 22:12:43 -07001839 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001840 rt->rt_gateway != rt->rt_dst &&
1841 rt->u.dst.dev->mtu > 576)
1842 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1843 }
1844#ifdef CONFIG_NET_CLS_ROUTE
1845 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1846#endif
1847 } else
1848 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1849
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001850 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001851 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
Rami Rosen6d273f82008-08-06 02:33:49 -07001852 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001853 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001854 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1856 ip_rt_min_advmss);
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001857 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001858 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1859
1860#ifdef CONFIG_NET_CLS_ROUTE
1861#ifdef CONFIG_IP_MULTIPLE_TABLES
1862 set_class_tag(rt, fib_rules_tclass(res));
1863#endif
1864 set_class_tag(rt, itag);
1865#endif
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001866 rt->rt_type = res->type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001867}
1868
Al Viro9e12bb22006-09-26 21:25:20 -07001869static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870 u8 tos, struct net_device *dev, int our)
1871{
1872 unsigned hash;
1873 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001874 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001875 struct in_device *in_dev = in_dev_get(dev);
1876 u32 itag = 0;
1877
1878 /* Primary sanity checks. */
1879
1880 if (in_dev == NULL)
1881 return -EINVAL;
1882
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001883 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001884 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001885 goto e_inval;
1886
Joe Perchesf97c1e02007-12-16 13:45:43 -08001887 if (ipv4_is_zeronet(saddr)) {
1888 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001889 goto e_inval;
1890 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1891 } else if (fib_validate_source(saddr, 0, tos, 0,
jamalb0c110c2009-10-18 02:12:33 +00001892 dev, &spec_dst, &itag, 0) < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001893 goto e_inval;
1894
1895 rth = dst_alloc(&ipv4_dst_ops);
1896 if (!rth)
1897 goto e_nobufs;
1898
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001899 rth->u.dst.output = ip_rt_bug;
1900 rth->u.dst.obsolete = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001901
1902 atomic_set(&rth->u.dst.__refcnt, 1);
1903 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001904 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001905 rth->u.dst.flags |= DST_NOPOLICY;
1906 rth->fl.fl4_dst = daddr;
1907 rth->rt_dst = daddr;
1908 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001909 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001910 rth->fl.fl4_src = saddr;
1911 rth->rt_src = saddr;
1912#ifdef CONFIG_NET_CLS_ROUTE
1913 rth->u.dst.tclassid = itag;
1914#endif
1915 rth->rt_iif =
1916 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001917 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 dev_hold(rth->u.dst.dev);
1919 rth->idev = in_dev_get(rth->u.dst.dev);
1920 rth->fl.oif = 0;
1921 rth->rt_gateway = daddr;
1922 rth->rt_spec_dst= spec_dst;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001923 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001925 rth->rt_type = RTN_MULTICAST;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001926 if (our) {
1927 rth->u.dst.input= ip_local_deliver;
1928 rth->rt_flags |= RTCF_LOCAL;
1929 }
1930
1931#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001932 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933 rth->u.dst.input = ip_mr_input;
1934#endif
1935 RT_CACHE_STAT_INC(in_slow_mc);
1936
1937 in_dev_put(in_dev);
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001938 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00001939 return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001940
1941e_nobufs:
1942 in_dev_put(in_dev);
1943 return -ENOBUFS;
1944
1945e_inval:
1946 in_dev_put(in_dev);
1947 return -EINVAL;
1948}
1949
1950
1951static void ip_handle_martian_source(struct net_device *dev,
1952 struct in_device *in_dev,
1953 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001954 __be32 daddr,
1955 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001956{
1957 RT_CACHE_STAT_INC(in_martian_src);
1958#ifdef CONFIG_IP_ROUTE_VERBOSE
1959 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1960 /*
1961 * RFC1812 recommendation, if source is martian,
1962 * the only hint is MAC header.
1963 */
Harvey Harrison673d57e2008-10-31 00:53:57 -07001964 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1965 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001966 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001967 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001968 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001969 printk(KERN_WARNING "ll header: ");
1970 for (i = 0; i < dev->hard_header_len; i++, p++) {
1971 printk("%02x", *p);
1972 if (i < (dev->hard_header_len - 1))
1973 printk(":");
1974 }
1975 printk("\n");
1976 }
1977 }
1978#endif
1979}
1980
Stephen Hemminger5969f712008-04-10 01:52:09 -07001981static int __mkroute_input(struct sk_buff *skb,
1982 struct fib_result *res,
1983 struct in_device *in_dev,
1984 __be32 daddr, __be32 saddr, u32 tos,
1985 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001986{
1987
1988 struct rtable *rth;
1989 int err;
1990 struct in_device *out_dev;
1991 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001992 __be32 spec_dst;
1993 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001994
1995 /* get a working reference to the output device */
1996 out_dev = in_dev_get(FIB_RES_DEV(*res));
1997 if (out_dev == NULL) {
1998 if (net_ratelimit())
1999 printk(KERN_CRIT "Bug in ip_route_input" \
2000 "_slow(). Please, report\n");
2001 return -EINVAL;
2002 }
2003
2004
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002005 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
jamalb0c110c2009-10-18 02:12:33 +00002006 in_dev->dev, &spec_dst, &itag, skb->mark);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002007 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002008 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002009 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002010
Linus Torvalds1da177e2005-04-16 15:20:36 -07002011 err = -EINVAL;
2012 goto cleanup;
2013 }
2014
2015 if (err)
2016 flags |= RTCF_DIRECTSRC;
2017
Thomas Graf51b77ca2008-06-03 16:36:01 -07002018 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002019 (IN_DEV_SHARED_MEDIA(out_dev) ||
2020 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2021 flags |= RTCF_DOREDIRECT;
2022
2023 if (skb->protocol != htons(ETH_P_IP)) {
2024 /* Not IP (i.e. ARP). Do not create route, if it is
2025 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002026 *
2027 * Proxy arp feature have been extended to allow, ARP
2028 * replies back to the same interface, to support
2029 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002030 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002031 if (out_dev == in_dev &&
2032 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002033 err = -EINVAL;
2034 goto cleanup;
2035 }
2036 }
2037
2038
2039 rth = dst_alloc(&ipv4_dst_ops);
2040 if (!rth) {
2041 err = -ENOBUFS;
2042 goto cleanup;
2043 }
2044
Julian Anastasovce723d82005-09-08 13:34:47 -07002045 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002046 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002047 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002048 rth->u.dst.flags |= DST_NOPOLICY;
Herbert Xu42f811b2007-06-04 23:34:44 -07002049 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002050 rth->u.dst.flags |= DST_NOXFRM;
2051 rth->fl.fl4_dst = daddr;
2052 rth->rt_dst = daddr;
2053 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002054 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002055 rth->fl.fl4_src = saddr;
2056 rth->rt_src = saddr;
2057 rth->rt_gateway = daddr;
2058 rth->rt_iif =
2059 rth->fl.iif = in_dev->dev->ifindex;
2060 rth->u.dst.dev = (out_dev)->dev;
2061 dev_hold(rth->u.dst.dev);
2062 rth->idev = in_dev_get(rth->u.dst.dev);
2063 rth->fl.oif = 0;
2064 rth->rt_spec_dst= spec_dst;
2065
Timo Teräsd11a4dc2010-03-18 23:20:20 +00002066 rth->u.dst.obsolete = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002067 rth->u.dst.input = ip_forward;
2068 rth->u.dst.output = ip_output;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002069 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002070
2071 rt_set_nexthop(rth, res, itag);
2072
2073 rth->rt_flags = flags;
2074
2075 *result = rth;
2076 err = 0;
2077 cleanup:
2078 /* release the working reference to the output device */
2079 in_dev_put(out_dev);
2080 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002081}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082
Stephen Hemminger5969f712008-04-10 01:52:09 -07002083static int ip_mkroute_input(struct sk_buff *skb,
2084 struct fib_result *res,
2085 const struct flowi *fl,
2086 struct in_device *in_dev,
2087 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088{
Chuck Short7abaa272005-06-22 22:10:23 -07002089 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002090 int err;
2091 unsigned hash;
2092
2093#ifdef CONFIG_IP_ROUTE_MULTIPATH
2094 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2095 fib_select_multipath(fl, res);
2096#endif
2097
2098 /* create a routing cache entry */
2099 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2100 if (err)
2101 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002102
2103 /* put it into the cache */
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002104 hash = rt_hash(daddr, saddr, fl->iif,
2105 rt_genid(dev_net(rth->u.dst.dev)));
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00002106 return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002107}
2108
Linus Torvalds1da177e2005-04-16 15:20:36 -07002109/*
2110 * NOTE. We drop all the packets that has local source
2111 * addresses, because every properly looped back packet
2112 * must have correct destination already attached by output routine.
2113 *
2114 * Such approach solves two big problems:
2115 * 1. Not simplex devices are handled properly.
2116 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2117 */
2118
Al Viro9e12bb22006-09-26 21:25:20 -07002119static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002120 u8 tos, struct net_device *dev)
2121{
2122 struct fib_result res;
2123 struct in_device *in_dev = in_dev_get(dev);
2124 struct flowi fl = { .nl_u = { .ip4_u =
2125 { .daddr = daddr,
2126 .saddr = saddr,
2127 .tos = tos,
2128 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002129 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002130 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002131 .iif = dev->ifindex };
2132 unsigned flags = 0;
2133 u32 itag = 0;
2134 struct rtable * rth;
2135 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002136 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002137 int err = -EINVAL;
2138 int free_res = 0;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002139 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140
2141 /* IP on this device is disabled. */
2142
2143 if (!in_dev)
2144 goto out;
2145
2146 /* Check for the most weird martians, which can be not detected
2147 by fib_lookup.
2148 */
2149
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002150 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002151 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002152 goto martian_source;
2153
Al Viroe4485152006-09-26 22:15:01 -07002154 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002155 goto brd_input;
2156
2157 /* Accept zero addresses only to limited broadcast;
2158 * I even do not know to fix it or not. Waiting for complains :-)
2159 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002160 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002161 goto martian_source;
2162
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002163 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002164 ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002165 goto martian_destination;
2166
2167 /*
2168 * Now we are ready to route packet.
2169 */
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002170 if ((err = fib_lookup(net, &fl, &res)) != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002172 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002173 goto no_route;
2174 }
2175 free_res = 1;
2176
2177 RT_CACHE_STAT_INC(in_slow_tot);
2178
2179 if (res.type == RTN_BROADCAST)
2180 goto brd_input;
2181
2182 if (res.type == RTN_LOCAL) {
2183 int result;
2184 result = fib_validate_source(saddr, daddr, tos,
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002185 net->loopback_dev->ifindex,
jamalb0c110c2009-10-18 02:12:33 +00002186 dev, &spec_dst, &itag, skb->mark);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002187 if (result < 0)
2188 goto martian_source;
2189 if (result)
2190 flags |= RTCF_DIRECTSRC;
2191 spec_dst = daddr;
2192 goto local_input;
2193 }
2194
2195 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002196 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197 if (res.type != RTN_UNICAST)
2198 goto martian_destination;
2199
2200 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002201done:
2202 in_dev_put(in_dev);
2203 if (free_res)
2204 fib_res_put(&res);
2205out: return err;
2206
2207brd_input:
2208 if (skb->protocol != htons(ETH_P_IP))
2209 goto e_inval;
2210
Joe Perchesf97c1e02007-12-16 13:45:43 -08002211 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2213 else {
2214 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
jamalb0c110c2009-10-18 02:12:33 +00002215 &itag, skb->mark);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216 if (err < 0)
2217 goto martian_source;
2218 if (err)
2219 flags |= RTCF_DIRECTSRC;
2220 }
2221 flags |= RTCF_BROADCAST;
2222 res.type = RTN_BROADCAST;
2223 RT_CACHE_STAT_INC(in_brd);
2224
2225local_input:
2226 rth = dst_alloc(&ipv4_dst_ops);
2227 if (!rth)
2228 goto e_nobufs;
2229
2230 rth->u.dst.output= ip_rt_bug;
Timo Teräsd11a4dc2010-03-18 23:20:20 +00002231 rth->u.dst.obsolete = -1;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002232 rth->rt_genid = rt_genid(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002233
2234 atomic_set(&rth->u.dst.__refcnt, 1);
2235 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002236 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002237 rth->u.dst.flags |= DST_NOPOLICY;
2238 rth->fl.fl4_dst = daddr;
2239 rth->rt_dst = daddr;
2240 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002241 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242 rth->fl.fl4_src = saddr;
2243 rth->rt_src = saddr;
2244#ifdef CONFIG_NET_CLS_ROUTE
2245 rth->u.dst.tclassid = itag;
2246#endif
2247 rth->rt_iif =
2248 rth->fl.iif = dev->ifindex;
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002249 rth->u.dst.dev = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002250 dev_hold(rth->u.dst.dev);
2251 rth->idev = in_dev_get(rth->u.dst.dev);
2252 rth->rt_gateway = daddr;
2253 rth->rt_spec_dst= spec_dst;
2254 rth->u.dst.input= ip_local_deliver;
2255 rth->rt_flags = flags|RTCF_LOCAL;
2256 if (res.type == RTN_UNREACHABLE) {
2257 rth->u.dst.input= ip_error;
2258 rth->u.dst.error= -err;
2259 rth->rt_flags &= ~RTCF_LOCAL;
2260 }
2261 rth->rt_type = res.type;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002262 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00002263 err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264 goto done;
2265
2266no_route:
2267 RT_CACHE_STAT_INC(in_no_route);
2268 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2269 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002270 if (err == -ESRCH)
2271 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002272 goto local_input;
2273
2274 /*
2275 * Do not cache martian addresses: they should be logged (RFC1812)
2276 */
2277martian_destination:
2278 RT_CACHE_STAT_INC(in_martian_dst);
2279#ifdef CONFIG_IP_ROUTE_VERBOSE
2280 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07002281 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2282 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002283#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002284
2285e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002286 err = -EHOSTUNREACH;
2287 goto done;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002288
Linus Torvalds1da177e2005-04-16 15:20:36 -07002289e_inval:
2290 err = -EINVAL;
2291 goto done;
2292
2293e_nobufs:
2294 err = -ENOBUFS;
2295 goto done;
2296
2297martian_source:
2298 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2299 goto e_inval;
2300}
2301
Al Viro9e12bb22006-09-26 21:25:20 -07002302int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002303 u8 tos, struct net_device *dev)
2304{
2305 struct rtable * rth;
2306 unsigned hash;
2307 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002308 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002309
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002310 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002311
2312 if (!rt_caching(net))
2313 goto skip_cache;
2314
Linus Torvalds1da177e2005-04-16 15:20:36 -07002315 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002316 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317
2318 rcu_read_lock();
2319 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002320 rth = rcu_dereference(rth->u.dst.rt_next)) {
Stephen Hemmingerc0b8c322008-04-10 04:00:28 -07002321 if (((rth->fl.fl4_dst ^ daddr) |
2322 (rth->fl.fl4_src ^ saddr) |
2323 (rth->fl.iif ^ iif) |
2324 rth->fl.oif |
2325 (rth->fl.fl4_tos ^ tos)) == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002326 rth->fl.mark == skb->mark &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002327 net_eq(dev_net(rth->u.dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002328 !rt_is_expired(rth)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002329 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002330 RT_CACHE_STAT_INC(in_hit);
2331 rcu_read_unlock();
Eric Dumazetadf30902009-06-02 05:19:30 +00002332 skb_dst_set(skb, &rth->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002333 return 0;
2334 }
2335 RT_CACHE_STAT_INC(in_hlist_search);
2336 }
2337 rcu_read_unlock();
2338
Neil Horman1080d702008-10-27 12:28:25 -07002339skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340 /* Multicast recognition logic is moved from route cache to here.
2341 The problem was that too many Ethernet cards have broken/missing
2342 hardware multicast filters :-( As result the host on multicasting
2343 network acquires a lot of useless route cache entries, sort of
2344 SDR messages from all the world. Now we try to get rid of them.
2345 Really, provided software IP multicast filter is organized
2346 reasonably (at least, hashed), it does not result in a slowdown
2347 comparing with route cache reject entries.
2348 Note, that multicast routers are not affected, because
2349 route cache entry is created eventually.
2350 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002351 if (ipv4_is_multicast(daddr)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002352 struct in_device *in_dev;
2353
2354 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002355 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002356 int our = ip_check_mc(in_dev, daddr, saddr,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002357 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002358 if (our
2359#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002360 ||
2361 (!ipv4_is_local_multicast(daddr) &&
2362 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002363#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002364 ) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002365 rcu_read_unlock();
2366 return ip_route_input_mc(skb, daddr, saddr,
2367 tos, dev, our);
2368 }
2369 }
2370 rcu_read_unlock();
2371 return -EINVAL;
2372 }
2373 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2374}
2375
Stephen Hemminger5969f712008-04-10 01:52:09 -07002376static int __mkroute_output(struct rtable **result,
2377 struct fib_result *res,
2378 const struct flowi *fl,
2379 const struct flowi *oldflp,
2380 struct net_device *dev_out,
2381 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002382{
2383 struct rtable *rth;
2384 struct in_device *in_dev;
2385 u32 tos = RT_FL_TOS(oldflp);
2386 int err = 0;
2387
Joe Perchesf97c1e02007-12-16 13:45:43 -08002388 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002389 return -EINVAL;
2390
Al Viroe4485152006-09-26 22:15:01 -07002391 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002392 res->type = RTN_BROADCAST;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002393 else if (ipv4_is_multicast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002394 res->type = RTN_MULTICAST;
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002395 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002396 return -EINVAL;
2397
2398 if (dev_out->flags & IFF_LOOPBACK)
2399 flags |= RTCF_LOCAL;
2400
2401 /* get work reference to inet device */
2402 in_dev = in_dev_get(dev_out);
2403 if (!in_dev)
2404 return -EINVAL;
2405
2406 if (res->type == RTN_BROADCAST) {
2407 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2408 if (res->fi) {
2409 fib_info_put(res->fi);
2410 res->fi = NULL;
2411 }
2412 } else if (res->type == RTN_MULTICAST) {
2413 flags |= RTCF_MULTICAST|RTCF_LOCAL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002414 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002415 oldflp->proto))
2416 flags &= ~RTCF_LOCAL;
2417 /* If multicast route do not exist use
2418 default one, but do not gateway in this case.
2419 Yes, it is hack.
2420 */
2421 if (res->fi && res->prefixlen < 4) {
2422 fib_info_put(res->fi);
2423 res->fi = NULL;
2424 }
2425 }
2426
2427
2428 rth = dst_alloc(&ipv4_dst_ops);
2429 if (!rth) {
2430 err = -ENOBUFS;
2431 goto cleanup;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002432 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002433
Julian Anastasovce723d82005-09-08 13:34:47 -07002434 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002435 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002436 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002437 rth->u.dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002438 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002439 rth->u.dst.flags |= DST_NOPOLICY;
2440
2441 rth->fl.fl4_dst = oldflp->fl4_dst;
2442 rth->fl.fl4_tos = tos;
2443 rth->fl.fl4_src = oldflp->fl4_src;
2444 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002445 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002446 rth->rt_dst = fl->fl4_dst;
2447 rth->rt_src = fl->fl4_src;
2448 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002449 /* get references to the devices that are to be hold by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002450 cache entry */
2451 rth->u.dst.dev = dev_out;
2452 dev_hold(dev_out);
2453 rth->idev = in_dev_get(dev_out);
2454 rth->rt_gateway = fl->fl4_dst;
2455 rth->rt_spec_dst= fl->fl4_src;
2456
2457 rth->u.dst.output=ip_output;
Timo Teräsd11a4dc2010-03-18 23:20:20 +00002458 rth->u.dst.obsolete = -1;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002459 rth->rt_genid = rt_genid(dev_net(dev_out));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002460
2461 RT_CACHE_STAT_INC(out_slow_tot);
2462
2463 if (flags & RTCF_LOCAL) {
2464 rth->u.dst.input = ip_local_deliver;
2465 rth->rt_spec_dst = fl->fl4_dst;
2466 }
2467 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2468 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002469 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470 !(dev_out->flags & IFF_LOOPBACK)) {
2471 rth->u.dst.output = ip_mc_output;
2472 RT_CACHE_STAT_INC(out_slow_mc);
2473 }
2474#ifdef CONFIG_IP_MROUTE
2475 if (res->type == RTN_MULTICAST) {
2476 if (IN_DEV_MFORWARD(in_dev) &&
Joe Perchesf97c1e02007-12-16 13:45:43 -08002477 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002478 rth->u.dst.input = ip_mr_input;
2479 rth->u.dst.output = ip_mc_output;
2480 }
2481 }
2482#endif
2483 }
2484
2485 rt_set_nexthop(rth, res, 0);
2486
2487 rth->rt_flags = flags;
2488
2489 *result = rth;
2490 cleanup:
2491 /* release work reference to inet device */
2492 in_dev_put(in_dev);
2493
2494 return err;
2495}
2496
Stephen Hemminger5969f712008-04-10 01:52:09 -07002497static int ip_mkroute_output(struct rtable **rp,
2498 struct fib_result *res,
2499 const struct flowi *fl,
2500 const struct flowi *oldflp,
2501 struct net_device *dev_out,
2502 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002503{
Chuck Short7abaa272005-06-22 22:10:23 -07002504 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002505 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2506 unsigned hash;
2507 if (err == 0) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07002508 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002509 rt_genid(dev_net(dev_out)));
Pavel Emelyanov6a2bad72010-03-24 21:51:22 +00002510 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002511 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002512
Linus Torvalds1da177e2005-04-16 15:20:36 -07002513 return err;
2514}
2515
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516/*
2517 * Major route resolver routine.
2518 */
2519
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002520static int ip_route_output_slow(struct net *net, struct rtable **rp,
2521 const struct flowi *oldflp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002522{
2523 u32 tos = RT_FL_TOS(oldflp);
2524 struct flowi fl = { .nl_u = { .ip4_u =
2525 { .daddr = oldflp->fl4_dst,
2526 .saddr = oldflp->fl4_src,
2527 .tos = tos & IPTOS_RT_MASK,
2528 .scope = ((tos & RTO_ONLINK) ?
2529 RT_SCOPE_LINK :
2530 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002531 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002532 .mark = oldflp->mark,
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002533 .iif = net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002534 .oif = oldflp->oif };
2535 struct fib_result res;
2536 unsigned flags = 0;
2537 struct net_device *dev_out = NULL;
2538 int free_res = 0;
2539 int err;
2540
2541
2542 res.fi = NULL;
2543#ifdef CONFIG_IP_MULTIPLE_TABLES
2544 res.r = NULL;
2545#endif
2546
2547 if (oldflp->fl4_src) {
2548 err = -EINVAL;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002549 if (ipv4_is_multicast(oldflp->fl4_src) ||
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002550 ipv4_is_lbcast(oldflp->fl4_src) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002551 ipv4_is_zeronet(oldflp->fl4_src))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002552 goto out;
2553
Linus Torvalds1da177e2005-04-16 15:20:36 -07002554 /* I removed check for oif == dev_out->oif here.
2555 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002556 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2557 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002558 2. Moreover, we are allowed to send packets with saddr
2559 of another iface. --ANK
2560 */
2561
Joe Perches9d4fb272009-11-23 10:41:23 -08002562 if (oldflp->oif == 0 &&
2563 (ipv4_is_multicast(oldflp->fl4_dst) ||
2564 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002565 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2566 dev_out = ip_dev_find(net, oldflp->fl4_src);
2567 if (dev_out == NULL)
2568 goto out;
2569
Linus Torvalds1da177e2005-04-16 15:20:36 -07002570 /* Special hack: user can direct multicasts
2571 and limited broadcast via necessary interface
2572 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2573 This hack is not just for fun, it allows
2574 vic,vat and friends to work.
2575 They bind socket to loopback, set ttl to zero
2576 and expect that it will work.
2577 From the viewpoint of routing cache they are broken,
2578 because we are not allowed to build multicast path
2579 with loopback source addr (look, routing cache
2580 cannot know, that ttl is zero, so that packet
2581 will not leave this host and route is valid).
2582 Luckily, this hack is good workaround.
2583 */
2584
2585 fl.oif = dev_out->ifindex;
2586 goto make_route;
2587 }
Julian Anastasova210d012008-10-01 07:28:28 -07002588
2589 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2590 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2591 dev_out = ip_dev_find(net, oldflp->fl4_src);
2592 if (dev_out == NULL)
2593 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002594 dev_put(dev_out);
Julian Anastasova210d012008-10-01 07:28:28 -07002595 dev_out = NULL;
2596 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002597 }
2598
2599
2600 if (oldflp->oif) {
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002601 dev_out = dev_get_by_index(net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602 err = -ENODEV;
2603 if (dev_out == NULL)
2604 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002605
2606 /* RACE: Check return value of inet_select_addr instead. */
2607 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002608 dev_put(dev_out);
2609 goto out; /* Wrong error code */
2610 }
2611
Joe Perchesf97c1e02007-12-16 13:45:43 -08002612 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2613 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002614 if (!fl.fl4_src)
2615 fl.fl4_src = inet_select_addr(dev_out, 0,
2616 RT_SCOPE_LINK);
2617 goto make_route;
2618 }
2619 if (!fl.fl4_src) {
Joe Perchesf97c1e02007-12-16 13:45:43 -08002620 if (ipv4_is_multicast(oldflp->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621 fl.fl4_src = inet_select_addr(dev_out, 0,
2622 fl.fl4_scope);
2623 else if (!oldflp->fl4_dst)
2624 fl.fl4_src = inet_select_addr(dev_out, 0,
2625 RT_SCOPE_HOST);
2626 }
2627 }
2628
2629 if (!fl.fl4_dst) {
2630 fl.fl4_dst = fl.fl4_src;
2631 if (!fl.fl4_dst)
2632 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2633 if (dev_out)
2634 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002635 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002636 dev_hold(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002637 fl.oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002638 res.type = RTN_LOCAL;
2639 flags |= RTCF_LOCAL;
2640 goto make_route;
2641 }
2642
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002643 if (fib_lookup(net, &fl, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002644 res.fi = NULL;
2645 if (oldflp->oif) {
2646 /* Apparently, routing tables are wrong. Assume,
2647 that the destination is on link.
2648
2649 WHY? DW.
2650 Because we are allowed to send to iface
2651 even if it has NO routes and NO assigned
2652 addresses. When oif is specified, routing
2653 tables are looked up with only one purpose:
2654 to catch if destination is gatewayed, rather than
2655 direct. Moreover, if MSG_DONTROUTE is set,
2656 we send packet, ignoring both routing tables
2657 and ifaddr state. --ANK
2658
2659
2660 We could make it even if oif is unknown,
2661 likely IPv6, but we do not.
2662 */
2663
2664 if (fl.fl4_src == 0)
2665 fl.fl4_src = inet_select_addr(dev_out, 0,
2666 RT_SCOPE_LINK);
2667 res.type = RTN_UNICAST;
2668 goto make_route;
2669 }
2670 if (dev_out)
2671 dev_put(dev_out);
2672 err = -ENETUNREACH;
2673 goto out;
2674 }
2675 free_res = 1;
2676
2677 if (res.type == RTN_LOCAL) {
2678 if (!fl.fl4_src)
2679 fl.fl4_src = fl.fl4_dst;
2680 if (dev_out)
2681 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002682 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002683 dev_hold(dev_out);
2684 fl.oif = dev_out->ifindex;
2685 if (res.fi)
2686 fib_info_put(res.fi);
2687 res.fi = NULL;
2688 flags |= RTCF_LOCAL;
2689 goto make_route;
2690 }
2691
2692#ifdef CONFIG_IP_ROUTE_MULTIPATH
2693 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2694 fib_select_multipath(&fl, &res);
2695 else
2696#endif
2697 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002698 fib_select_default(net, &fl, &res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002699
2700 if (!fl.fl4_src)
2701 fl.fl4_src = FIB_RES_PREFSRC(res);
2702
2703 if (dev_out)
2704 dev_put(dev_out);
2705 dev_out = FIB_RES_DEV(res);
2706 dev_hold(dev_out);
2707 fl.oif = dev_out->ifindex;
2708
2709
2710make_route:
2711 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2712
2713
2714 if (free_res)
2715 fib_res_put(&res);
2716 if (dev_out)
2717 dev_put(dev_out);
2718out: return err;
2719}
2720
Denis V. Lunev611c1832008-01-22 22:06:48 -08002721int __ip_route_output_key(struct net *net, struct rtable **rp,
2722 const struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002723{
2724 unsigned hash;
2725 struct rtable *rth;
2726
Neil Horman1080d702008-10-27 12:28:25 -07002727 if (!rt_caching(net))
2728 goto slow_output;
2729
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002730 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002731
2732 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08002733 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2734 rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002735 if (rth->fl.fl4_dst == flp->fl4_dst &&
2736 rth->fl.fl4_src == flp->fl4_src &&
2737 rth->fl.iif == 0 &&
2738 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002739 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002740 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002741 (IPTOS_RT_MASK | RTO_ONLINK)) &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002742 net_eq(dev_net(rth->u.dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002743 !rt_is_expired(rth)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002744 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002745 RT_CACHE_STAT_INC(out_hit);
2746 rcu_read_unlock_bh();
2747 *rp = rth;
2748 return 0;
2749 }
2750 RT_CACHE_STAT_INC(out_hlist_search);
2751 }
2752 rcu_read_unlock_bh();
2753
Neil Horman1080d702008-10-27 12:28:25 -07002754slow_output:
Denis V. Lunev611c1832008-01-22 22:06:48 -08002755 return ip_route_output_slow(net, rp, flp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002756}
2757
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002758EXPORT_SYMBOL_GPL(__ip_route_output_key);
2759
David S. Miller14e50e52007-05-24 18:17:54 -07002760static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2761{
2762}
2763
2764static struct dst_ops ipv4_dst_blackhole_ops = {
2765 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -08002766 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002767 .destroy = ipv4_dst_destroy,
2768 .check = ipv4_dst_check,
2769 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
Eric Dumazete2422972008-01-30 20:07:45 -08002770 .entries = ATOMIC_INIT(0),
David S. Miller14e50e52007-05-24 18:17:54 -07002771};
2772
2773
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002774static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
David S. Miller14e50e52007-05-24 18:17:54 -07002775{
2776 struct rtable *ort = *rp;
2777 struct rtable *rt = (struct rtable *)
2778 dst_alloc(&ipv4_dst_blackhole_ops);
2779
2780 if (rt) {
2781 struct dst_entry *new = &rt->u.dst;
2782
2783 atomic_set(&new->__refcnt, 1);
2784 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002785 new->input = dst_discard;
2786 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002787 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2788
2789 new->dev = ort->u.dst.dev;
2790 if (new->dev)
2791 dev_hold(new->dev);
2792
2793 rt->fl = ort->fl;
2794
2795 rt->idev = ort->idev;
2796 if (rt->idev)
2797 in_dev_hold(rt->idev);
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002798 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002799 rt->rt_flags = ort->rt_flags;
2800 rt->rt_type = ort->rt_type;
2801 rt->rt_dst = ort->rt_dst;
2802 rt->rt_src = ort->rt_src;
2803 rt->rt_iif = ort->rt_iif;
2804 rt->rt_gateway = ort->rt_gateway;
2805 rt->rt_spec_dst = ort->rt_spec_dst;
2806 rt->peer = ort->peer;
2807 if (rt->peer)
2808 atomic_inc(&rt->peer->refcnt);
2809
2810 dst_free(new);
2811 }
2812
2813 dst_release(&(*rp)->u.dst);
2814 *rp = rt;
2815 return (rt ? 0 : -ENOMEM);
2816}
2817
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002818int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2819 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002820{
2821 int err;
2822
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002823 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824 return err;
2825
2826 if (flp->proto) {
2827 if (!flp->fl4_src)
2828 flp->fl4_src = (*rp)->rt_src;
2829 if (!flp->fl4_dst)
2830 flp->fl4_dst = (*rp)->rt_dst;
Alexey Dobriyan52479b62008-11-25 17:35:18 -08002831 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
Herbert Xubb728452007-12-12 18:48:58 -08002832 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002833 if (err == -EREMOTE)
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002834 err = ipv4_dst_blackhole(net, rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002835
2836 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002837 }
2838
2839 return 0;
2840}
2841
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002842EXPORT_SYMBOL_GPL(ip_route_output_flow);
2843
Denis V. Lunevf2063512008-01-22 22:07:34 -08002844int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845{
Denis V. Lunevf2063512008-01-22 22:07:34 -08002846 return ip_route_output_flow(net, rp, flp, NULL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002847}
2848
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002849static int rt_fill_info(struct net *net,
2850 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002851 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002852{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002853 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002854 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002855 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002856 long expires;
2857 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002858
2859 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2860 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002861 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002862
2863 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002864 r->rtm_family = AF_INET;
2865 r->rtm_dst_len = 32;
2866 r->rtm_src_len = 0;
2867 r->rtm_tos = rt->fl.fl4_tos;
2868 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002869 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002870 r->rtm_type = rt->rt_type;
2871 r->rtm_scope = RT_SCOPE_UNIVERSE;
2872 r->rtm_protocol = RTPROT_UNSPEC;
2873 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2874 if (rt->rt_flags & RTCF_NOTIFY)
2875 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002876
Al Viro17fb2c62006-09-26 22:15:25 -07002877 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002878
Linus Torvalds1da177e2005-04-16 15:20:36 -07002879 if (rt->fl.fl4_src) {
2880 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002881 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002882 }
2883 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002884 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002885#ifdef CONFIG_NET_CLS_ROUTE
2886 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002887 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002888#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002889 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002890 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002891 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002892 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002893
Linus Torvalds1da177e2005-04-16 15:20:36 -07002894 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002895 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002896
Linus Torvalds1da177e2005-04-16 15:20:36 -07002897 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002898 goto nla_put_failure;
2899
Thomas Grafe3703b32006-11-27 09:27:07 -08002900 error = rt->u.dst.error;
2901 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002902 if (rt->peer) {
Eric Dumazet2c1409a2009-11-12 09:33:09 +00002903 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002904 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002905 ts = rt->peer->tcp_ts;
James Morris9d729f72007-03-04 16:12:44 -08002906 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002907 }
2908 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002909
Linus Torvalds1da177e2005-04-16 15:20:36 -07002910 if (rt->fl.iif) {
2911#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002912 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002913
Joe Perchesf97c1e02007-12-16 13:45:43 -08002914 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002915 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2916 int err = ipmr_get_route(net, skb, r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002917 if (err <= 0) {
2918 if (!nowait) {
2919 if (err == 0)
2920 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002921 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002922 } else {
2923 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002924 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002925 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002926 }
2927 }
2928 } else
2929#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002930 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002931 }
2932
Thomas Grafe3703b32006-11-27 09:27:07 -08002933 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2934 expires, error) < 0)
2935 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002936
Thomas Grafbe403ea2006-08-17 18:15:17 -07002937 return nlmsg_end(skb, nlh);
2938
2939nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002940 nlmsg_cancel(skb, nlh);
2941 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002942}
2943
Thomas Graf63f34442007-03-22 11:55:17 -07002944static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002945{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002946 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07002947 struct rtmsg *rtm;
2948 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002949 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002950 __be32 dst = 0;
2951 __be32 src = 0;
2952 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002953 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002954 struct sk_buff *skb;
2955
Thomas Grafd889ce32006-08-17 18:15:44 -07002956 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2957 if (err < 0)
2958 goto errout;
2959
2960 rtm = nlmsg_data(nlh);
2961
Linus Torvalds1da177e2005-04-16 15:20:36 -07002962 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002963 if (skb == NULL) {
2964 err = -ENOBUFS;
2965 goto errout;
2966 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002967
2968 /* Reserve room for dummy headers, this skb can pass
2969 through good chunk of routing engine.
2970 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002971 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002972 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002973
2974 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002975 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002976 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2977
Al Viro17fb2c62006-09-26 22:15:25 -07002978 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2979 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002980 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002981
2982 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002983 struct net_device *dev;
2984
Denis V. Lunev19375042008-02-28 20:52:04 -08002985 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002986 if (dev == NULL) {
2987 err = -ENODEV;
2988 goto errout_free;
2989 }
2990
Linus Torvalds1da177e2005-04-16 15:20:36 -07002991 skb->protocol = htons(ETH_P_IP);
2992 skb->dev = dev;
2993 local_bh_disable();
2994 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2995 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002996
Eric Dumazet511c3f92009-06-02 05:14:27 +00002997 rt = skb_rtable(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002998 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002999 err = -rt->u.dst.error;
3000 } else {
Thomas Grafd889ce32006-08-17 18:15:44 -07003001 struct flowi fl = {
3002 .nl_u = {
3003 .ip4_u = {
3004 .daddr = dst,
3005 .saddr = src,
3006 .tos = rtm->rtm_tos,
3007 },
3008 },
3009 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3010 };
Denis V. Lunev19375042008-02-28 20:52:04 -08003011 err = ip_route_output_key(net, &rt, &fl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003012 }
Thomas Grafd889ce32006-08-17 18:15:44 -07003013
Linus Torvalds1da177e2005-04-16 15:20:36 -07003014 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07003015 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003016
Eric Dumazetadf30902009-06-02 05:19:30 +00003017 skb_dst_set(skb, &rt->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003018 if (rtm->rtm_flags & RTM_F_NOTIFY)
3019 rt->rt_flags |= RTCF_NOTIFY;
3020
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003021 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08003022 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07003023 if (err <= 0)
3024 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003025
Denis V. Lunev19375042008-02-28 20:52:04 -08003026 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07003027errout:
Thomas Graf2942e902006-08-15 00:30:25 -07003028 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003029
Thomas Grafd889ce32006-08-17 18:15:44 -07003030errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003031 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07003032 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003033}
3034
3035int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3036{
3037 struct rtable *rt;
3038 int h, s_h;
3039 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08003040 struct net *net;
3041
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09003042 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003043
3044 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08003045 if (s_h < 0)
3046 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003047 s_idx = idx = cb->args[1];
Eric Dumazeta6272662008-08-28 01:11:25 -07003048 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3049 if (!rt_hash_table[h].chain)
3050 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003051 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08003052 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3053 rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09003054 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003055 continue;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003056 if (rt_is_expired(rt))
Eric Dumazet29e75252008-01-31 17:05:09 -08003057 continue;
Eric Dumazetadf30902009-06-02 05:19:30 +00003058 skb_dst_set(skb, dst_clone(&rt->u.dst));
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003059 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003060 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07003061 1, NLM_F_MULTI) <= 0) {
Eric Dumazetadf30902009-06-02 05:19:30 +00003062 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003063 rcu_read_unlock_bh();
3064 goto done;
3065 }
Eric Dumazetadf30902009-06-02 05:19:30 +00003066 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003067 }
3068 rcu_read_unlock_bh();
3069 }
3070
3071done:
3072 cb->args[0] = h;
3073 cb->args[1] = idx;
3074 return skb->len;
3075}
3076
3077void ip_rt_multicast_event(struct in_device *in_dev)
3078{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07003079 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003080}
3081
3082#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003083static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003084 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003085 size_t *lenp, loff_t *ppos)
3086{
3087 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003088 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003089 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003090 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003091
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003092 memcpy(&ctl, __ctl, sizeof(ctl));
3093 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003094 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003095
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003096 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003097 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003098 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003099 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003100
3101 return -EINVAL;
3102}
3103
Herbert Xuc6153b52008-08-15 13:44:31 -07003104static void rt_secret_reschedule(int old)
3105{
3106 struct net *net;
3107 int new = ip_rt_secret_interval;
3108 int diff = new - old;
3109
3110 if (!diff)
3111 return;
3112
3113 rtnl_lock();
3114 for_each_net(net) {
3115 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
Vitaliy Gusev858a18a2010-03-16 01:07:51 +00003116 long time;
Herbert Xuc6153b52008-08-15 13:44:31 -07003117
3118 if (!new)
3119 continue;
3120
3121 if (deleted) {
Vitaliy Gusev858a18a2010-03-16 01:07:51 +00003122 time = net->ipv4.rt_secret_timer.expires - jiffies;
Herbert Xuc6153b52008-08-15 13:44:31 -07003123
3124 if (time <= 0 || (time += diff) <= 0)
3125 time = 0;
Herbert Xuc6153b52008-08-15 13:44:31 -07003126 } else
Vitaliy Gusev858a18a2010-03-16 01:07:51 +00003127 time = new;
Herbert Xuc6153b52008-08-15 13:44:31 -07003128
Vitaliy Gusev858a18a2010-03-16 01:07:51 +00003129 mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
Herbert Xuc6153b52008-08-15 13:44:31 -07003130 }
3131 rtnl_unlock();
3132}
3133
3134static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
Herbert Xuc6153b52008-08-15 13:44:31 -07003135 void __user *buffer, size_t *lenp,
3136 loff_t *ppos)
3137{
3138 int old = ip_rt_secret_interval;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003139 int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
Herbert Xuc6153b52008-08-15 13:44:31 -07003140
3141 rt_secret_reschedule(old);
3142
3143 return ret;
3144}
3145
Al Viroeeb61f72008-07-27 08:59:33 +01003146static ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003147 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003148 .procname = "gc_thresh",
3149 .data = &ipv4_dst_ops.gc_thresh,
3150 .maxlen = sizeof(int),
3151 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003152 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003153 },
3154 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003155 .procname = "max_size",
3156 .data = &ip_rt_max_size,
3157 .maxlen = sizeof(int),
3158 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003159 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003160 },
3161 {
3162 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003163
Linus Torvalds1da177e2005-04-16 15:20:36 -07003164 .procname = "gc_min_interval",
3165 .data = &ip_rt_gc_min_interval,
3166 .maxlen = sizeof(int),
3167 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003168 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003169 },
3170 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003171 .procname = "gc_min_interval_ms",
3172 .data = &ip_rt_gc_min_interval,
3173 .maxlen = sizeof(int),
3174 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003175 .proc_handler = proc_dointvec_ms_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003176 },
3177 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003178 .procname = "gc_timeout",
3179 .data = &ip_rt_gc_timeout,
3180 .maxlen = sizeof(int),
3181 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003182 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003183 },
3184 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003185 .procname = "gc_interval",
3186 .data = &ip_rt_gc_interval,
3187 .maxlen = sizeof(int),
3188 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003189 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003190 },
3191 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003192 .procname = "redirect_load",
3193 .data = &ip_rt_redirect_load,
3194 .maxlen = sizeof(int),
3195 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003196 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003197 },
3198 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003199 .procname = "redirect_number",
3200 .data = &ip_rt_redirect_number,
3201 .maxlen = sizeof(int),
3202 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003203 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003204 },
3205 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003206 .procname = "redirect_silence",
3207 .data = &ip_rt_redirect_silence,
3208 .maxlen = sizeof(int),
3209 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003210 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003211 },
3212 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003213 .procname = "error_cost",
3214 .data = &ip_rt_error_cost,
3215 .maxlen = sizeof(int),
3216 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003217 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003218 },
3219 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003220 .procname = "error_burst",
3221 .data = &ip_rt_error_burst,
3222 .maxlen = sizeof(int),
3223 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003224 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003225 },
3226 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003227 .procname = "gc_elasticity",
3228 .data = &ip_rt_gc_elasticity,
3229 .maxlen = sizeof(int),
3230 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003231 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003232 },
3233 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003234 .procname = "mtu_expires",
3235 .data = &ip_rt_mtu_expires,
3236 .maxlen = sizeof(int),
3237 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003238 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003239 },
3240 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003241 .procname = "min_pmtu",
3242 .data = &ip_rt_min_pmtu,
3243 .maxlen = sizeof(int),
3244 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003245 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003246 },
3247 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003248 .procname = "min_adv_mss",
3249 .data = &ip_rt_min_advmss,
3250 .maxlen = sizeof(int),
3251 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003252 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003253 },
3254 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003255 .procname = "secret_interval",
3256 .data = &ip_rt_secret_interval,
3257 .maxlen = sizeof(int),
3258 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003259 .proc_handler = ipv4_sysctl_rt_secret_interval,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003260 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003261 { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003262};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003263
Al Viro2f4520d2008-08-25 15:17:44 -07003264static struct ctl_table empty[1];
3265
3266static struct ctl_table ipv4_skeleton[] =
3267{
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003268 { .procname = "route",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003269 .mode = 0555, .child = ipv4_route_table},
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003270 { .procname = "neigh",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003271 .mode = 0555, .child = empty},
Al Viro2f4520d2008-08-25 15:17:44 -07003272 { }
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003273};
3274
Al Viro2f4520d2008-08-25 15:17:44 -07003275static __net_initdata struct ctl_path ipv4_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003276 { .procname = "net", },
3277 { .procname = "ipv4", },
Al Viro2f4520d2008-08-25 15:17:44 -07003278 { },
3279};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003280
3281static struct ctl_table ipv4_route_flush_table[] = {
3282 {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003283 .procname = "flush",
3284 .maxlen = sizeof(int),
3285 .mode = 0200,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003286 .proc_handler = ipv4_sysctl_rtcache_flush,
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003287 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003288 { },
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003289};
3290
Al Viro2f4520d2008-08-25 15:17:44 -07003291static __net_initdata struct ctl_path ipv4_route_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003292 { .procname = "net", },
3293 { .procname = "ipv4", },
3294 { .procname = "route", },
Al Viro2f4520d2008-08-25 15:17:44 -07003295 { },
3296};
3297
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003298static __net_init int sysctl_route_net_init(struct net *net)
3299{
3300 struct ctl_table *tbl;
3301
3302 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003303 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003304 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3305 if (tbl == NULL)
3306 goto err_dup;
3307 }
3308 tbl[0].extra1 = net;
3309
3310 net->ipv4.route_hdr =
3311 register_net_sysctl_table(net, ipv4_route_path, tbl);
3312 if (net->ipv4.route_hdr == NULL)
3313 goto err_reg;
3314 return 0;
3315
3316err_reg:
3317 if (tbl != ipv4_route_flush_table)
3318 kfree(tbl);
3319err_dup:
3320 return -ENOMEM;
3321}
3322
3323static __net_exit void sysctl_route_net_exit(struct net *net)
3324{
3325 struct ctl_table *tbl;
3326
3327 tbl = net->ipv4.route_hdr->ctl_table_arg;
3328 unregister_net_sysctl_table(net->ipv4.route_hdr);
3329 BUG_ON(tbl == ipv4_route_flush_table);
3330 kfree(tbl);
3331}
3332
3333static __net_initdata struct pernet_operations sysctl_route_ops = {
3334 .init = sysctl_route_net_init,
3335 .exit = sysctl_route_net_exit,
3336};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003337#endif
3338
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003339
3340static __net_init int rt_secret_timer_init(struct net *net)
3341{
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003342 atomic_set(&net->ipv4.rt_genid,
3343 (int) ((num_physpages ^ (num_physpages>>8)) ^
3344 (jiffies ^ (jiffies >> 7))));
3345
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003346 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3347 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3348 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3349
Herbert Xuc6153b52008-08-15 13:44:31 -07003350 if (ip_rt_secret_interval) {
3351 net->ipv4.rt_secret_timer.expires =
3352 jiffies + net_random() % ip_rt_secret_interval +
3353 ip_rt_secret_interval;
3354 add_timer(&net->ipv4.rt_secret_timer);
3355 }
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003356 return 0;
3357}
3358
3359static __net_exit void rt_secret_timer_exit(struct net *net)
3360{
3361 del_timer_sync(&net->ipv4.rt_secret_timer);
3362}
3363
3364static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3365 .init = rt_secret_timer_init,
3366 .exit = rt_secret_timer_exit,
3367};
3368
3369
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-cpu accounting area, allocated by ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3373
3374static __initdata unsigned long rhash_entries;
3375static int __init set_rhash_entries(char *str)
3376{
3377 if (!str)
3378 return 0;
3379 rhash_entries = simple_strtoul(str, &str, 0);
3380 return 1;
3381}
3382__setup("rhash_entries=", set_rhash_entries);
3383
3384int __init ip_rt_init(void)
3385{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003386 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003387
Linus Torvalds1da177e2005-04-16 15:20:36 -07003388#ifdef CONFIG_NET_CLS_ROUTE
Ingo Molnar0dcec8c2009-02-25 14:07:33 +01003389 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003390 if (!ip_rt_acct)
3391 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003392#endif
3393
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003394 ipv4_dst_ops.kmem_cachep =
3395 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003396 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003397
David S. Miller14e50e52007-05-24 18:17:54 -07003398 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3399
Eric Dumazet424c4b72005-07-05 14:58:19 -07003400 rt_hash_table = (struct rt_hash_bucket *)
3401 alloc_large_system_hash("IP route cache",
3402 sizeof(struct rt_hash_bucket),
3403 rhash_entries,
Jan Beulich44813742009-09-21 17:03:05 -07003404 (totalram_pages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003405 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003406 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003407 &rt_hash_log,
3408 &rt_hash_mask,
Anton Blanchardc9503e02009-04-27 05:42:24 -07003409 rhash_entries ? 0 : 512 * 1024);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003410 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3411 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003412
3413 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3414 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3415
Linus Torvalds1da177e2005-04-16 15:20:36 -07003416 devinet_init();
3417 ip_fib_init();
3418
Linus Torvalds1da177e2005-04-16 15:20:36 -07003419 /* All the timers, started at system startup tend
3420 to synchronize. Perturb it a bit.
3421 */
Eric Dumazet125bb8f2009-06-11 20:10:07 +00003422 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3423 expires_ljiffies = jiffies;
Eric Dumazet39c90ec2007-09-15 10:55:54 -07003424 schedule_delayed_work(&expires_work,
3425 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003426
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003427 if (register_pernet_subsys(&rt_secret_timer_ops))
3428 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003429
Denis V. Lunev73b38712008-02-28 20:51:18 -08003430 if (ip_rt_proc_init())
Pavel Emelyanov107f1632007-12-05 21:14:28 -08003431 printk(KERN_ERR "Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003432#ifdef CONFIG_XFRM
3433 xfrm_init();
Neil Hormana33bc5c2009-07-30 18:52:15 -07003434 xfrm4_init(ip_rt_max_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003435#endif
Thomas Graf63f34442007-03-22 11:55:17 -07003436 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3437
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003438#ifdef CONFIG_SYSCTL
3439 register_pernet_subsys(&sysctl_route_ops);
3440#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003441 return rc;
3442}
3443
Al Viroa1bc6eb2008-07-30 06:32:52 -04003444#ifdef CONFIG_SYSCTL
Al Viroeeb61f72008-07-27 08:59:33 +01003445/*
3446 * We really need to sanitize the damn ipv4 init order, then all
3447 * this nonsense will go away.
3448 */
3449void __init ip_static_sysctl_init(void)
3450{
Al Viro2f4520d2008-08-25 15:17:44 -07003451 register_sysctl_paths(ipv4_path, ipv4_skeleton);
Al Viroeeb61f72008-07-27 08:59:33 +01003452}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003453#endif
Al Viroeeb61f72008-07-27 08:59:33 +01003454
Linus Torvalds1da177e2005-04-16 15:20:36 -07003455EXPORT_SYMBOL(__ip_select_ident);
3456EXPORT_SYMBOL(ip_route_input);
3457EXPORT_SYMBOL(ip_route_output_key);