blob: 90cdcfc32937358d1d12da545525437704cf4492 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Herbert Xu352e5122007-11-13 21:34:06 -080093#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020094#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070095#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700105#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700106#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
/*
 * Extract the routing-relevant TOS bits (plus the RTO_ONLINK flag) from a
 * flow key.  The argument is parenthesized so that any pointer expression,
 * not only a plain identifier, can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* Upper bound for an IPv4 MTU value used by this file. */
#define IP_MAX_MTU	0xFFF0

/* Default for ip_rt_gc_timeout: 300 seconds expressed in jiffies. */
#define RT_GC_TIMEOUT	(300*HZ)
117
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700119static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122static int ip_rt_redirect_number __read_mostly = 9;
123static int ip_rt_redirect_load __read_mostly = HZ / 50;
124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly = HZ;
126static int ip_rt_error_burst __read_mostly = 5 * HZ;
127static int ip_rt_gc_elasticity __read_mostly = 8;
128static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
Neil Horman1080d702008-10-27 12:28:25 -0700132static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700133
Eric Dumazet125bb8f2009-06-11 20:10:07 +0000134static struct delayed_work expires_work;
135static unsigned long expires_ljiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136
137/*
138 * Interface to generic destination cache.
139 */
140
141static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142static void ipv4_dst_destroy(struct dst_entry *dst);
143static void ipv4_dst_ifdown(struct dst_entry *dst,
144 struct net_device *dev, int how);
145static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146static void ipv4_link_failure(struct sk_buff *skb);
147static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800148static int rt_garbage_collect(struct dst_ops *ops);
Neil Horman1080d702008-10-27 12:28:25 -0700149static void rt_emergency_hash_rebuild(struct net *net);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700150
151
152static struct dst_ops ipv4_dst_ops = {
153 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -0800154 .protocol = cpu_to_be16(ETH_P_IP),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700155 .gc = rt_garbage_collect,
156 .check = ipv4_dst_check,
157 .destroy = ipv4_dst_destroy,
158 .ifdown = ipv4_dst_ifdown,
159 .negative_advice = ipv4_negative_advice,
160 .link_failure = ipv4_link_failure,
161 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu1ac06e02008-05-20 14:32:14 -0700162 .local_out = __ip_local_out,
Eric Dumazete2422972008-01-30 20:07:45 -0800163 .entries = ATOMIC_INIT(0),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700164};
165
166#define ECN_OR_COST(class) TC_PRIO_##class
167
Philippe De Muyter4839c522007-07-09 15:32:57 -0700168const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700169 TC_PRIO_BESTEFFORT,
170 ECN_OR_COST(FILLER),
171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(BESTEFFORT),
173 TC_PRIO_BULK,
174 ECN_OR_COST(BULK),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_INTERACTIVE,
178 ECN_OR_COST(INTERACTIVE),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE_BULK,
182 ECN_OR_COST(INTERACTIVE_BULK),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK)
185};
186
187
188/*
189 * Route cache.
190 */
191
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries;
 *    they do so with atomic increments and with the
 *    lock held.
 */
201
/* One hash bucket: the head of a chain of rtables linked by u.dst.rt_next. */
struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Buckets do not each own a spinlock.  A fixed table of locks is shared
 * instead, and a bucket picks its lock by masking the bucket index.
 * The table size is a power of two chosen from NR_CPUS; with lockdep,
 * spinlock_t is large, so the table is kept small.
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialise the shared lock table; panics if allocation fails. */
static __init void rt_hash_lock_init(void)
{
	int i = 0;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (rt_hash_locks == NULL)
		panic("IP: failed to allocate rt_hash_locks\n");

	while (i < RT_HASH_LOCK_SZ) {
		spin_lock_init(&rt_hash_locks[i]);
		i++;
	}
}
#else
/* No lock table in this configuration: callers are handed a NULL lock. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700252static struct rt_hash_bucket *rt_hash_table __read_mostly;
253static unsigned rt_hash_mask __read_mostly;
254static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700255
Eric Dumazet2f970d82006-01-17 02:54:36 -0800256static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800257#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700258 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700259
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700260static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261 int genid)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700262{
Stephen Hemminger1294fc42008-04-10 01:54:01 -0700263 return jhash_3words((__force u32)(__be32)(daddr),
264 (__force u32)(__be32)(saddr),
Denis V. Lunevb00180d2008-07-05 19:04:09 -0700265 idx, genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800266 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700267}
268
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700269static inline int rt_genid(struct net *net)
270{
271 return atomic_read(&net->ipv4.rt_genid);
272}
273
Linus Torvalds1da177e2005-04-16 15:20:36 -0700274#ifdef CONFIG_PROC_FS
275struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800276 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800278 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700279};
280
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900281static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700282{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900283 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700285
286 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
Eric Dumazeta6272662008-08-28 01:11:25 -0700287 if (!rt_hash_table[st->bucket].chain)
288 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700289 rcu_read_lock_bh();
Eric Dumazet29e75252008-01-31 17:05:09 -0800290 r = rcu_dereference(rt_hash_table[st->bucket].chain);
291 while (r) {
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900292 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800293 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800294 return r;
295 r = rcu_dereference(r->u.dst.rt_next);
296 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700297 rcu_read_unlock_bh();
298 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800299 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300}
301
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900302static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800303 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700304{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900305 struct rt_cache_iter_state *st = seq->private;
Eric Dumazeta6272662008-08-28 01:11:25 -0700306
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800307 r = r->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700308 while (!r) {
309 rcu_read_unlock_bh();
Eric Dumazeta6272662008-08-28 01:11:25 -0700310 do {
311 if (--st->bucket < 0)
312 return NULL;
313 } while (!rt_hash_table[st->bucket].chain);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314 rcu_read_lock_bh();
315 r = rt_hash_table[st->bucket].chain;
316 }
Eric Dumazet0bccead2008-01-10 03:55:57 -0800317 return rcu_dereference(r);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700318}
319
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900320static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800321 struct rtable *r)
322{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900323 struct rt_cache_iter_state *st = seq->private;
324 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800326 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800327 if (r->rt_genid == st->genid)
328 break;
329 }
330 return r;
331}
332
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900333static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700334{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900335 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336
337 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900338 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700339 --pos;
340 return pos ? NULL : r;
341}
342
343static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344{
Eric Dumazet29e75252008-01-31 17:05:09 -0800345 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800346 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900347 return rt_cache_get_idx(seq, *pos - 1);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700348 st->genid = rt_genid(seq_file_net(seq));
Eric Dumazet29e75252008-01-31 17:05:09 -0800349 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700350}
351
352static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353{
Eric Dumazet29e75252008-01-31 17:05:09 -0800354 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700355
356 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900357 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700358 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900359 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700360 ++*pos;
361 return r;
362}
363
364static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365{
366 if (v && v != SEQ_START_TOKEN)
367 rcu_read_unlock_bh();
368}
369
370static int rt_cache_seq_show(struct seq_file *seq, void *v)
371{
372 if (v == SEQ_START_TOKEN)
373 seq_printf(seq, "%-127s\n",
374 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376 "HHUptod\tSpecDst");
377 else {
378 struct rtable *r = v;
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700379 int len;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700380
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700381 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700383 r->u.dst.dev ? r->u.dst.dev->name : "*",
384 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386 r->u.dst.__use, 0, (unsigned long)r->rt_src,
387 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389 dst_metric(&r->u.dst, RTAX_WINDOW),
390 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391 dst_metric(&r->u.dst, RTAX_RTTVAR)),
392 r->fl.fl4_tos,
393 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395 dev_queue_xmit) : 0,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700396 r->rt_spec_dst, &len);
397
398 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900399 }
400 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700401}
402
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700403static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700404 .start = rt_cache_seq_start,
405 .next = rt_cache_seq_next,
406 .stop = rt_cache_seq_stop,
407 .show = rt_cache_seq_show,
408};
409
410static int rt_cache_seq_open(struct inode *inode, struct file *file)
411{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800412 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700413 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700414}
415
Arjan van de Ven9a321442007-02-12 00:55:35 -0800416static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700417 .owner = THIS_MODULE,
418 .open = rt_cache_seq_open,
419 .read = seq_read,
420 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800421 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700422};
423
424
425static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426{
427 int cpu;
428
429 if (*pos == 0)
430 return SEQ_START_TOKEN;
431
Rusty Russell0f23174a2008-12-29 12:23:42 +0000432 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700433 if (!cpu_possible(cpu))
434 continue;
435 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800436 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700437 }
438 return NULL;
439}
440
441static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442{
443 int cpu;
444
Rusty Russell0f23174a2008-12-29 12:23:42 +0000445 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700446 if (!cpu_possible(cpu))
447 continue;
448 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800449 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450 }
451 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900452
Linus Torvalds1da177e2005-04-16 15:20:36 -0700453}
454
/* seq_file ->stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
459
460static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461{
462 struct rt_cache_stat *st = v;
463
464 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700465 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700466 return 0;
467 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900468
Linus Torvalds1da177e2005-04-16 15:20:36 -0700469 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
470 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471 atomic_read(&ipv4_dst_ops.entries),
472 st->in_hit,
473 st->in_slow_tot,
474 st->in_slow_mc,
475 st->in_no_route,
476 st->in_brd,
477 st->in_martian_dst,
478 st->in_martian_src,
479
480 st->out_hit,
481 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900482 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483
484 st->gc_total,
485 st->gc_ignored,
486 st->gc_goal_miss,
487 st->gc_dst_overflow,
488 st->in_hlist_search,
489 st->out_hlist_search
490 );
491 return 0;
492}
493
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700494static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700495 .start = rt_cpu_seq_start,
496 .next = rt_cpu_seq_next,
497 .stop = rt_cpu_seq_stop,
498 .show = rt_cpu_seq_show,
499};
500
501
502static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503{
504 return seq_open(file, &rt_cpu_seq_ops);
505}
506
Arjan van de Ven9a321442007-02-12 00:55:35 -0800507static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508 .owner = THIS_MODULE,
509 .open = rt_cpu_seq_open,
510 .read = seq_read,
511 .llseek = seq_lseek,
512 .release = seq_release,
513};
514
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800515#ifdef CONFIG_NET_CLS_ROUTE
Alexey Dobriyana661c412009-11-25 15:40:35 -0800516static int rt_acct_proc_show(struct seq_file *m, void *v)
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800517{
Alexey Dobriyana661c412009-11-25 15:40:35 -0800518 struct ip_rt_acct *dst, *src;
519 unsigned int i, j;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800520
Alexey Dobriyana661c412009-11-25 15:40:35 -0800521 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522 if (!dst)
523 return -ENOMEM;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800524
Alexey Dobriyana661c412009-11-25 15:40:35 -0800525 for_each_possible_cpu(i) {
526 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527 for (j = 0; j < 256; j++) {
528 dst[j].o_bytes += src[j].o_bytes;
529 dst[j].o_packets += src[j].o_packets;
530 dst[j].i_bytes += src[j].i_bytes;
531 dst[j].i_packets += src[j].i_packets;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800532 }
533 }
Alexey Dobriyana661c412009-11-25 15:40:35 -0800534
535 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536 kfree(dst);
537 return 0;
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800538}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800539
540static int rt_acct_proc_open(struct inode *inode, struct file *file)
541{
542 return single_open(file, rt_acct_proc_show, NULL);
543}
544
545static const struct file_operations rt_acct_proc_fops = {
546 .owner = THIS_MODULE,
547 .open = rt_acct_proc_open,
548 .read = seq_read,
549 .llseek = seq_lseek,
550 .release = single_release,
551};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800552#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800553
Denis V. Lunev73b38712008-02-28 20:51:18 -0800554static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800555{
556 struct proc_dir_entry *pde;
557
558 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559 &rt_cache_seq_fops);
560 if (!pde)
561 goto err1;
562
Wang Chen77020722008-02-28 14:14:25 -0800563 pde = proc_create("rt_cache", S_IRUGO,
564 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800565 if (!pde)
566 goto err2;
567
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800568#ifdef CONFIG_NET_CLS_ROUTE
Alexey Dobriyana661c412009-11-25 15:40:35 -0800569 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800570 if (!pde)
571 goto err3;
572#endif
573 return 0;
574
575#ifdef CONFIG_NET_CLS_ROUTE
576err3:
577 remove_proc_entry("rt_cache", net->proc_net_stat);
578#endif
579err2:
580 remove_proc_entry("rt_cache", net->proc_net);
581err1:
582 return -ENOMEM;
583}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800584
585static void __net_exit ip_rt_do_proc_exit(struct net *net)
586{
587 remove_proc_entry("rt_cache", net->proc_net_stat);
588 remove_proc_entry("rt_cache", net->proc_net);
589 remove_proc_entry("rt_acct", net->proc_net);
590}
591
592static struct pernet_operations ip_rt_proc_ops __net_initdata = {
593 .init = ip_rt_do_proc_init,
594 .exit = ip_rt_do_proc_exit,
595};
596
597static int __init ip_rt_proc_init(void)
598{
599 return register_pernet_subsys(&ip_rt_proc_ops);
600}
601
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800602#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void) { return 0; }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700607#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900608
Stephen Hemminger5969f712008-04-10 01:52:09 -0700609static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700610{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700611 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612}
613
/* Drop the caller's reference on @rt, then queue it for RCU-deferred freeing. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
619
Stephen Hemminger5969f712008-04-10 01:52:09 -0700620static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700621{
622 /* Kill broadcast/multicast entries very aggresively, if they
623 collide in hash table with more useful entries */
624 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800625 rth->fl.iif && rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700626}
627
Stephen Hemminger5969f712008-04-10 01:52:09 -0700628static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629{
630 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631 rth->u.dst.expires;
632}
633
634static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635{
636 unsigned long age;
637 int ret = 0;
638
639 if (atomic_read(&rth->u.dst.__refcnt))
640 goto out;
641
642 ret = 1;
643 if (rth->u.dst.expires &&
644 time_after_eq(jiffies, rth->u.dst.expires))
645 goto out;
646
647 age = jiffies - rth->u.dst.lastuse;
648 ret = 0;
649 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650 (age <= tmo2 && rt_valuable(rth)))
651 goto out;
652 ret = 1;
653out: return ret;
654}
655
656/* Bits of score are:
657 * 31: very valuable
658 * 30: not quite useless
659 * 29..0: usage counter
660 */
661static inline u32 rt_score(struct rtable *rt)
662{
663 u32 score = jiffies - rt->u.dst.lastuse;
664
665 score = ~score & ~(3<<30);
666
667 if (rt_valuable(rt))
668 score |= (1<<31);
669
670 if (!rt->fl.iif ||
671 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672 score |= (1<<30);
673
674 return score;
675}
676
Neil Horman1080d702008-10-27 12:28:25 -0700677static inline bool rt_caching(const struct net *net)
678{
679 return net->ipv4.current_rt_cache_rebuild_count <=
680 net->ipv4.sysctl_rt_cache_rebuild_count;
681}
682
683static inline bool compare_hash_inputs(const struct flowi *fl1,
684 const struct flowi *fl2)
685{
686 return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
687 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
688 (fl1->iif ^ fl2->iif)) == 0);
689}
690
Linus Torvalds1da177e2005-04-16 15:20:36 -0700691static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
692{
Al Viro714e85b2006-11-14 20:51:49 -0800693 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
694 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
Thomas Graf47dcf0c2006-11-09 15:20:38 -0800695 (fl1->mark ^ fl2->mark) |
David S. Miller8238b212006-10-12 00:49:15 -0700696 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
697 *(u16 *)&fl2->nl_u.ip4_u.tos) |
698 (fl1->oif ^ fl2->oif) |
699 (fl1->iif ^ fl2->iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700700}
701
Denis V. Lunevb5921912008-01-22 23:50:25 -0800702static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
703{
Octavian Purdila09ad9bc2009-11-25 15:14:13 -0800704 return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
Denis V. Lunevb5921912008-01-22 23:50:25 -0800705}
706
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700707static inline int rt_is_expired(struct rtable *rth)
708{
709 return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
710}
711
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800712/*
713 * Perform a full scan of hash table and free all entries.
714 * Can be called by a softirq or a process.
715 * In the later case, we want to be reschedule if necessary
716 */
717static void rt_do_flush(int process_context)
718{
719 unsigned int i;
720 struct rtable *rth, *next;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700721 struct rtable * tail;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800722
723 for (i = 0; i <= rt_hash_mask; i++) {
724 if (process_context && need_resched())
725 cond_resched();
726 rth = rt_hash_table[i].chain;
727 if (!rth)
728 continue;
729
730 spin_lock_bh(rt_hash_lock_addr(i));
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700731#ifdef CONFIG_NET_NS
732 {
733 struct rtable ** prev, * p;
734
735 rth = rt_hash_table[i].chain;
736
737 /* defer releasing the head of the list after spin_unlock */
738 for (tail = rth; tail; tail = tail->u.dst.rt_next)
739 if (!rt_is_expired(tail))
740 break;
741 if (rth != tail)
742 rt_hash_table[i].chain = tail;
743
744 /* call rt_free on entries after the tail requiring flush */
745 prev = &rt_hash_table[i].chain;
746 for (p = *prev; p; p = next) {
747 next = p->u.dst.rt_next;
748 if (!rt_is_expired(p)) {
749 prev = &p->u.dst.rt_next;
750 } else {
751 *prev = next;
752 rt_free(p);
753 }
754 }
755 }
756#else
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800757 rth = rt_hash_table[i].chain;
758 rt_hash_table[i].chain = NULL;
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700759 tail = NULL;
760#endif
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800761 spin_unlock_bh(rt_hash_lock_addr(i));
762
Denis V. Lunev32cb5b42008-07-05 19:06:12 -0700763 for (; rth != tail; rth = next) {
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800764 next = rth->u.dst.rt_next;
765 rt_free(rth);
766 }
767 }
768}
769
/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */
777
778#define FRACT_BITS 3
779#define ONE (1UL << FRACT_BITS)
780
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800781static void rt_check_expire(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700782{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700783 static unsigned int rover;
784 unsigned int i = rover, goal;
Eric Dumazet1ddbcb02009-05-19 20:14:28 +0000785 struct rtable *rth, *aux, **rthp;
Eric Dumazetcf8da762009-05-19 18:54:22 +0000786 unsigned long samples = 0;
Neil Horman1080d702008-10-27 12:28:25 -0700787 unsigned long sum = 0, sum2 = 0;
Eric Dumazet125bb8f2009-06-11 20:10:07 +0000788 unsigned long delta;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700789 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700790
Eric Dumazet125bb8f2009-06-11 20:10:07 +0000791 delta = jiffies - expires_ljiffies;
792 expires_ljiffies = jiffies;
793 mult = ((u64)delta) << rt_hash_log;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700794 if (ip_rt_gc_timeout > 1)
795 do_div(mult, ip_rt_gc_timeout);
796 goal = (unsigned int)mult;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700797 if (goal > rt_hash_mask)
798 goal = rt_hash_mask + 1;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700799 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700800 unsigned long tmo = ip_rt_gc_timeout;
Eric Dumazetcf8da762009-05-19 18:54:22 +0000801 unsigned long length;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700802
803 i = (i + 1) & rt_hash_mask;
804 rthp = &rt_hash_table[i].chain;
805
Eric Dumazetd90bf5a2007-11-14 16:14:05 -0800806 if (need_resched())
807 cond_resched();
808
Neil Horman1080d702008-10-27 12:28:25 -0700809 samples++;
810
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700811 if (*rthp == NULL)
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700812 continue;
Eric Dumazetcf8da762009-05-19 18:54:22 +0000813 length = 0;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700814 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700815 while ((rth = *rthp) != NULL) {
Eric Dumazet1ddbcb02009-05-19 20:14:28 +0000816 prefetch(rth->u.dst.rt_next);
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700817 if (rt_is_expired(rth)) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800818 *rthp = rth->u.dst.rt_next;
819 rt_free(rth);
820 continue;
821 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700822 if (rth->u.dst.expires) {
823 /* Entry is expired even if it is in use */
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700824 if (time_before_eq(jiffies, rth->u.dst.expires)) {
Eric Dumazet1ddbcb02009-05-19 20:14:28 +0000825nofree:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700826 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800827 rthp = &rth->u.dst.rt_next;
Neil Horman1080d702008-10-27 12:28:25 -0700828 /*
Eric Dumazet1ddbcb02009-05-19 20:14:28 +0000829 * We only count entries on
Neil Horman1080d702008-10-27 12:28:25 -0700830 * a chain with equal hash inputs once
831 * so that entries for different QOS
832 * levels, and other non-hash input
833 * attributes don't unfairly skew
834 * the length computation
835 */
Eric Dumazet1ddbcb02009-05-19 20:14:28 +0000836 for (aux = rt_hash_table[i].chain;;) {
837 if (aux == rth) {
838 length += ONE;
839 break;
840 }
841 if (compare_hash_inputs(&aux->fl, &rth->fl))
842 break;
843 aux = aux->u.dst.rt_next;
844 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700845 continue;
846 }
Eric Dumazet1ddbcb02009-05-19 20:14:28 +0000847 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
848 goto nofree;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700849
850 /* Cleanup aged off entries. */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800851 *rthp = rth->u.dst.rt_next;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900852 rt_free(rth);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700853 }
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700854 spin_unlock_bh(rt_hash_lock_addr(i));
Neil Horman1080d702008-10-27 12:28:25 -0700855 sum += length;
856 sum2 += length*length;
857 }
858 if (samples) {
859 unsigned long avg = sum / samples;
860 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
861 rt_chain_length_max = max_t(unsigned long,
862 ip_rt_gc_elasticity,
863 (avg + 4*sd) >> FRACT_BITS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700864 }
865 rover = i;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800866}
867
868/*
869 * rt_worker_func() is run in process context.
Eric Dumazet29e75252008-01-31 17:05:09 -0800870 * we call rt_check_expire() to scan part of the hash table
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800871 */
872static void rt_worker_func(struct work_struct *work)
873{
Eric Dumazet29e75252008-01-31 17:05:09 -0800874 rt_check_expire();
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700875 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700876}
877
Eric Dumazet29e75252008-01-31 17:05:09 -0800878/*
879 * Pertubation of rt_genid by a small quantity [1..256]
880 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
881 * many times (2^24) without giving recent rt_genid.
882 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700883 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700884static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700885{
Eric Dumazet29e75252008-01-31 17:05:09 -0800886 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700887
Eric Dumazet29e75252008-01-31 17:05:09 -0800888 get_random_bytes(&shuffle, sizeof(shuffle));
Denis V. Luneve84f84f2008-07-05 19:04:32 -0700889 atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700890}
891
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);

	if (delay < 0)
		return;

	rt_do_flush(!in_softirq());
}
902
/*
 * Flush entries that earlier invalidations left in the cache.
 * Unlike rt_cache_flush(), this does not change rt_genid itself.
 */
void rt_cache_flush_batch(void)
{
	int process_context = !in_softirq();

	rt_do_flush(process_context);
}
908
Eric Dumazet29e75252008-01-31 17:05:09 -0800909/*
910 * We change rt_genid and let gc do the cleanup
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800911 */
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700912static void rt_secret_rebuild(unsigned long __net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700913{
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700914 struct net *net = (struct net *)__net;
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700915 rt_cache_invalidate(net);
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700916 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700917}
918
Neil Horman1080d702008-10-27 12:28:25 -0700919static void rt_secret_rebuild_oneshot(struct net *net)
920{
921 del_timer_sync(&net->ipv4.rt_secret_timer);
922 rt_cache_invalidate(net);
923 if (ip_rt_secret_interval) {
924 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
925 add_timer(&net->ipv4.rt_secret_timer);
926 }
927}
928
929static void rt_emergency_hash_rebuild(struct net *net)
930{
931 if (net_ratelimit()) {
932 printk(KERN_WARNING "Route hash chain too long!\n");
933 printk(KERN_WARNING "Adjust your secret_interval!\n");
934 }
935
936 rt_secret_rebuild_oneshot(net);
937}
938
Linus Torvalds1da177e2005-04-16 15:20:36 -0700939/*
940 Short description of GC goals.
941
942 We want to build algorithm, which will keep routing cache
943 at some equilibrium point, when number of aged off entries
944 is kept approximately equal to newly generated ones.
945
946 Current expiration strength is variable "expire".
947 We try to adjust it dynamically, so that if networking
948 is idle expires is large enough to keep enough of warm entries,
949 and when load increases it reduces to limit cache size.
950 */
951
Daniel Lezcano569d3642008-01-18 03:56:57 -0800952static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700953{
954 static unsigned long expire = RT_GC_TIMEOUT;
955 static unsigned long last_gc;
956 static int rover;
957 static int equilibrium;
958 struct rtable *rth, **rthp;
959 unsigned long now = jiffies;
960 int goal;
961
962 /*
963 * Garbage collection is pretty expensive,
964 * do not make it too frequently.
965 */
966
967 RT_CACHE_STAT_INC(gc_total);
968
969 if (now - last_gc < ip_rt_gc_min_interval &&
970 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
971 RT_CACHE_STAT_INC(gc_ignored);
972 goto out;
973 }
974
975 /* Calculate number of entries, which we want to expire now. */
976 goal = atomic_read(&ipv4_dst_ops.entries) -
977 (ip_rt_gc_elasticity << rt_hash_log);
978 if (goal <= 0) {
979 if (equilibrium < ipv4_dst_ops.gc_thresh)
980 equilibrium = ipv4_dst_ops.gc_thresh;
981 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
982 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -0800983 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700984 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
985 }
986 } else {
987 /* We are in dangerous area. Try to reduce cache really
988 * aggressively.
989 */
Eric Dumazetb790ced2007-12-21 01:49:07 -0800990 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700991 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
992 }
993
994 if (now - last_gc >= ip_rt_gc_min_interval)
995 last_gc = now;
996
997 if (goal <= 0) {
998 equilibrium += goal;
999 goto work_done;
1000 }
1001
1002 do {
1003 int i, k;
1004
1005 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1006 unsigned long tmo = expire;
1007
1008 k = (k + 1) & rt_hash_mask;
1009 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001010 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001011 while ((rth = *rthp) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001012 if (!rt_is_expired(rth) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08001013 !rt_may_expire(rth, tmo, expire)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001014 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001015 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001016 continue;
1017 }
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001018 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001019 rt_free(rth);
1020 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001021 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001022 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001023 if (goal <= 0)
1024 break;
1025 }
1026 rover = k;
1027
1028 if (goal <= 0)
1029 goto work_done;
1030
1031 /* Goal is not achieved. We stop process if:
1032
1033 - if expire reduced to zero. Otherwise, expire is halfed.
1034 - if table is not full.
1035 - if we are called from interrupt.
1036 - jiffies check is just fallback/debug loop breaker.
1037 We will not spin here for long time in any case.
1038 */
1039
1040 RT_CACHE_STAT_INC(gc_goal_miss);
1041
1042 if (expire == 0)
1043 break;
1044
1045 expire >>= 1;
1046#if RT_CACHE_DEBUG >= 2
1047 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1048 atomic_read(&ipv4_dst_ops.entries), goal, i);
1049#endif
1050
1051 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1052 goto out;
1053 } while (!in_softirq() && time_before_eq(jiffies, now));
1054
1055 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1056 goto out;
1057 if (net_ratelimit())
1058 printk(KERN_WARNING "dst cache overflow\n");
1059 RT_CACHE_STAT_INC(gc_dst_overflow);
1060 return 1;
1061
1062work_done:
1063 expire += ip_rt_gc_min_interval;
1064 if (expire > ip_rt_gc_timeout ||
1065 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1066 expire = ip_rt_gc_timeout;
1067#if RT_CACHE_DEBUG >= 2
1068 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1069 atomic_read(&ipv4_dst_ops.entries), goal, rover);
1070#endif
1071out: return 0;
1072}
1073
Eric Dumazet511c3f92009-06-02 05:14:27 +00001074static int rt_intern_hash(unsigned hash, struct rtable *rt,
1075 struct rtable **rp, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001076{
1077 struct rtable *rth, **rthp;
1078 unsigned long now;
1079 struct rtable *cand, **candp;
1080 u32 min_score;
1081 int chain_length;
1082 int attempts = !in_softirq();
1083
1084restart:
1085 chain_length = 0;
1086 min_score = ~(u32)0;
1087 cand = NULL;
1088 candp = NULL;
1089 now = jiffies;
1090
Neil Horman1080d702008-10-27 12:28:25 -07001091 if (!rt_caching(dev_net(rt->u.dst.dev))) {
Neil Horman73e42892009-06-20 01:15:16 -07001092 /*
1093 * If we're not caching, just tell the caller we
1094 * were successful and don't touch the route. The
1095 * caller hold the sole reference to the cache entry, and
1096 * it will be released when the caller is done with it.
1097 * If we drop it here, the callers have no way to resolve routes
1098 * when we're not caching. Instead, just point *rp at rt, so
1099 * the caller gets a single use out of the route
Neil Hormanb6280b42009-06-22 10:18:53 +00001100 * Note that we do rt_free on this new route entry, so that
1101 * once its refcount hits zero, we are still able to reap it
1102 * (Thanks Alexey)
1103 * Note also the rt_free uses call_rcu. We don't actually
1104 * need rcu protection here, this is just our path to get
1105 * on the route gc list.
Neil Horman73e42892009-06-20 01:15:16 -07001106 */
Neil Hormanb6280b42009-06-22 10:18:53 +00001107
1108 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1109 int err = arp_bind_neighbour(&rt->u.dst);
1110 if (err) {
1111 if (net_ratelimit())
1112 printk(KERN_WARNING
1113 "Neighbour table failure & not caching routes.\n");
1114 rt_drop(rt);
1115 return err;
1116 }
1117 }
1118
1119 rt_free(rt);
1120 goto skip_hashing;
Neil Horman1080d702008-10-27 12:28:25 -07001121 }
1122
Linus Torvalds1da177e2005-04-16 15:20:36 -07001123 rthp = &rt_hash_table[hash].chain;
1124
Eric Dumazet22c047c2005-07-05 14:55:24 -07001125 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001126 while ((rth = *rthp) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001127 if (rt_is_expired(rth)) {
Eric Dumazet29e75252008-01-31 17:05:09 -08001128 *rthp = rth->u.dst.rt_next;
1129 rt_free(rth);
1130 continue;
1131 }
Denis V. Lunevb5921912008-01-22 23:50:25 -08001132 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001133 /* Put it first */
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001134 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001135 /*
1136 * Since lookup is lockfree, the deletion
1137 * must be visible to another weakly ordered CPU before
1138 * the insertion at the start of the hash chain.
1139 */
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001140 rcu_assign_pointer(rth->u.dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001141 rt_hash_table[hash].chain);
1142 /*
1143 * Since lookup is lockfree, the update writes
1144 * must be ordered for consistency on SMP.
1145 */
1146 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1147
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08001148 dst_use(&rth->u.dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -07001149 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150
1151 rt_drop(rt);
Eric Dumazet511c3f92009-06-02 05:14:27 +00001152 if (rp)
1153 *rp = rth;
1154 else
Eric Dumazetadf30902009-06-02 05:19:30 +00001155 skb_dst_set(skb, &rth->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001156 return 0;
1157 }
1158
1159 if (!atomic_read(&rth->u.dst.__refcnt)) {
1160 u32 score = rt_score(rth);
1161
1162 if (score <= min_score) {
1163 cand = rth;
1164 candp = rthp;
1165 min_score = score;
1166 }
1167 }
1168
1169 chain_length++;
1170
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001171 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001172 }
1173
1174 if (cand) {
1175 /* ip_rt_gc_elasticity used to be average length of chain
1176 * length, when exceeded gc becomes really aggressive.
1177 *
1178 * The second limit is less certain. At the moment it allows
1179 * only 2 entries per bucket. We will see.
1180 */
1181 if (chain_length > ip_rt_gc_elasticity) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001182 *candp = cand->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001183 rt_free(cand);
1184 }
Neil Horman1080d702008-10-27 12:28:25 -07001185 } else {
1186 if (chain_length > rt_chain_length_max) {
1187 struct net *net = dev_net(rt->u.dst.dev);
1188 int num = ++net->ipv4.current_rt_cache_rebuild_count;
1189 if (!rt_caching(dev_net(rt->u.dst.dev))) {
1190 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1191 rt->u.dst.dev->name, num);
1192 }
1193 rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1194 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001195 }
1196
1197 /* Try to bind route to arp only if it is output
1198 route or unicast forwarding path.
1199 */
1200 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1201 int err = arp_bind_neighbour(&rt->u.dst);
1202 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001203 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001204
1205 if (err != -ENOBUFS) {
1206 rt_drop(rt);
1207 return err;
1208 }
1209
1210 /* Neighbour tables are full and nothing
1211 can be released. Try to shrink route cache,
1212 it is most likely it holds some neighbour records.
1213 */
1214 if (attempts-- > 0) {
1215 int saved_elasticity = ip_rt_gc_elasticity;
1216 int saved_int = ip_rt_gc_min_interval;
1217 ip_rt_gc_elasticity = 1;
1218 ip_rt_gc_min_interval = 0;
Daniel Lezcano569d3642008-01-18 03:56:57 -08001219 rt_garbage_collect(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001220 ip_rt_gc_min_interval = saved_int;
1221 ip_rt_gc_elasticity = saved_elasticity;
1222 goto restart;
1223 }
1224
1225 if (net_ratelimit())
1226 printk(KERN_WARNING "Neighbour table overflow.\n");
1227 rt_drop(rt);
1228 return -ENOBUFS;
1229 }
1230 }
1231
Eric Dumazet1ddbcb02009-05-19 20:14:28 +00001232 rt->u.dst.rt_next = rt_hash_table[hash].chain;
Neil Horman1080d702008-10-27 12:28:25 -07001233
Linus Torvalds1da177e2005-04-16 15:20:36 -07001234#if RT_CACHE_DEBUG >= 2
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001235 if (rt->u.dst.rt_next) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001236 struct rtable *trt;
Neil Hormanb6280b42009-06-22 10:18:53 +00001237 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1238 hash, &rt->rt_dst);
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001239 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
Harvey Harrison673d57e2008-10-31 00:53:57 -07001240 printk(" . %pI4", &trt->rt_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001241 printk("\n");
1242 }
1243#endif
Eric Dumazet00269b52008-10-16 14:18:29 -07001244 /*
1245 * Since lookup is lockfree, we must make sure
1246 * previous writes to rt are comitted to memory
1247 * before making rt visible to other CPUS.
1248 */
Eric Dumazet1ddbcb02009-05-19 20:14:28 +00001249 rcu_assign_pointer(rt_hash_table[hash].chain, rt);
Neil Horman1080d702008-10-27 12:28:25 -07001250
Eric Dumazet22c047c2005-07-05 14:55:24 -07001251 spin_unlock_bh(rt_hash_lock_addr(hash));
Neil Horman73e42892009-06-20 01:15:16 -07001252
Neil Hormanb6280b42009-06-22 10:18:53 +00001253skip_hashing:
Eric Dumazet511c3f92009-06-02 05:14:27 +00001254 if (rp)
1255 *rp = rt;
1256 else
Eric Dumazetadf30902009-06-02 05:19:30 +00001257 skb_dst_set(skb, &rt->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001258 return 0;
1259}
1260
1261void rt_bind_peer(struct rtable *rt, int create)
1262{
1263 static DEFINE_SPINLOCK(rt_peer_lock);
1264 struct inet_peer *peer;
1265
1266 peer = inet_getpeer(rt->rt_dst, create);
1267
1268 spin_lock_bh(&rt_peer_lock);
1269 if (rt->peer == NULL) {
1270 rt->peer = peer;
1271 peer = NULL;
1272 }
1273 spin_unlock_bh(&rt_peer_lock);
1274 if (peer)
1275 inet_putpeer(peer);
1276}
1277
1278/*
1279 * Peer allocation may fail only in serious out-of-memory conditions. However
1280 * we still can generate some output.
1281 * Random ID selection looks a bit dangerous because we have no chances to
1282 * select ID being unique in a reasonable period of time.
1283 * But broken packet identifier may be better than no packet at all.
1284 */
1285static void ip_select_fb_ident(struct iphdr *iph)
1286{
1287 static DEFINE_SPINLOCK(ip_fb_id_lock);
1288 static u32 ip_fallback_id;
1289 u32 salt;
1290
1291 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001292 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293 iph->id = htons(salt & 0xFFFF);
1294 ip_fallback_id = salt;
1295 spin_unlock_bh(&ip_fb_id_lock);
1296}
1297
1298void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1299{
1300 struct rtable *rt = (struct rtable *) dst;
1301
1302 if (rt) {
1303 if (rt->peer == NULL)
1304 rt_bind_peer(rt, 1);
1305
1306 /* If peer is attached to destination, it is never detached,
1307 so that we need not to grab a lock to dereference it.
1308 */
1309 if (rt->peer) {
1310 iph->id = htons(inet_getid(rt->peer, more));
1311 return;
1312 }
1313 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001314 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001315 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001316
1317 ip_select_fb_ident(iph);
1318}
1319
1320static void rt_del(unsigned hash, struct rtable *rt)
1321{
Eric Dumazet29e75252008-01-31 17:05:09 -08001322 struct rtable **rthp, *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001323
Eric Dumazet29e75252008-01-31 17:05:09 -08001324 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001325 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001326 ip_rt_put(rt);
Eric Dumazet29e75252008-01-31 17:05:09 -08001327 while ((aux = *rthp) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001328 if (aux == rt || rt_is_expired(aux)) {
Eric Dumazet29e75252008-01-31 17:05:09 -08001329 *rthp = aux->u.dst.rt_next;
1330 rt_free(aux);
1331 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001332 }
Eric Dumazet29e75252008-01-31 17:05:09 -08001333 rthp = &aux->u.dst.rt_next;
1334 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001335 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001336}
1337
Al Virof7655222006-09-26 21:25:43 -07001338void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1339 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001340{
1341 int i, k;
1342 struct in_device *in_dev = in_dev_get(dev);
1343 struct rtable *rth, **rthp;
Al Virof7655222006-09-26 21:25:43 -07001344 __be32 skeys[2] = { saddr, 0 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001345 int ikeys[2] = { dev->ifindex, 0 };
Tom Tucker8d717402006-07-30 20:43:36 -07001346 struct netevent_redirect netevent;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001347 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001348
Linus Torvalds1da177e2005-04-16 15:20:36 -07001349 if (!in_dev)
1350 return;
1351
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001352 net = dev_net(dev);
Joe Perches9d4fb272009-11-23 10:41:23 -08001353 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1354 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1355 ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001356 goto reject_redirect;
1357
Neil Horman1080d702008-10-27 12:28:25 -07001358 if (!rt_caching(net))
1359 goto reject_redirect;
1360
Linus Torvalds1da177e2005-04-16 15:20:36 -07001361 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1362 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1363 goto reject_redirect;
1364 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1365 goto reject_redirect;
1366 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001367 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368 goto reject_redirect;
1369 }
1370
1371 for (i = 0; i < 2; i++) {
1372 for (k = 0; k < 2; k++) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001373 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001374 rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375
1376 rthp=&rt_hash_table[hash].chain;
1377
1378 rcu_read_lock();
1379 while ((rth = rcu_dereference(*rthp)) != NULL) {
1380 struct rtable *rt;
1381
1382 if (rth->fl.fl4_dst != daddr ||
1383 rth->fl.fl4_src != skeys[i] ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001384 rth->fl.oif != ikeys[k] ||
Eric Dumazet29e75252008-01-31 17:05:09 -08001385 rth->fl.iif != 0 ||
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001386 rt_is_expired(rth) ||
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001387 !net_eq(dev_net(rth->u.dst.dev), net)) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001388 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001389 continue;
1390 }
1391
1392 if (rth->rt_dst != daddr ||
1393 rth->rt_src != saddr ||
1394 rth->u.dst.error ||
1395 rth->rt_gateway != old_gw ||
1396 rth->u.dst.dev != dev)
1397 break;
1398
1399 dst_hold(&rth->u.dst);
1400 rcu_read_unlock();
1401
1402 rt = dst_alloc(&ipv4_dst_ops);
1403 if (rt == NULL) {
1404 ip_rt_put(rth);
1405 in_dev_put(in_dev);
1406 return;
1407 }
1408
1409 /* Copy all the information. */
1410 *rt = *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001411 rt->u.dst.__use = 1;
1412 atomic_set(&rt->u.dst.__refcnt, 1);
1413 rt->u.dst.child = NULL;
1414 if (rt->u.dst.dev)
1415 dev_hold(rt->u.dst.dev);
1416 if (rt->idev)
1417 in_dev_hold(rt->idev);
1418 rt->u.dst.obsolete = 0;
1419 rt->u.dst.lastuse = jiffies;
1420 rt->u.dst.path = &rt->u.dst;
1421 rt->u.dst.neighbour = NULL;
1422 rt->u.dst.hh = NULL;
Alexey Dobriyandef8b4f2008-10-28 13:24:06 -07001423#ifdef CONFIG_XFRM
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424 rt->u.dst.xfrm = NULL;
Alexey Dobriyandef8b4f2008-10-28 13:24:06 -07001425#endif
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001426 rt->rt_genid = rt_genid(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001427 rt->rt_flags |= RTCF_REDIRECTED;
1428
1429 /* Gateway is different ... */
1430 rt->rt_gateway = new_gw;
1431
1432 /* Redirect received -> path was valid */
1433 dst_confirm(&rth->u.dst);
1434
1435 if (rt->peer)
1436 atomic_inc(&rt->peer->refcnt);
1437
1438 if (arp_bind_neighbour(&rt->u.dst) ||
1439 !(rt->u.dst.neighbour->nud_state &
1440 NUD_VALID)) {
1441 if (rt->u.dst.neighbour)
1442 neigh_event_send(rt->u.dst.neighbour, NULL);
1443 ip_rt_put(rth);
1444 rt_drop(rt);
1445 goto do_next;
1446 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001447
Tom Tucker8d717402006-07-30 20:43:36 -07001448 netevent.old = &rth->u.dst;
1449 netevent.new = &rt->u.dst;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001450 call_netevent_notifiers(NETEVENT_REDIRECT,
1451 &netevent);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452
1453 rt_del(hash, rth);
Eric Dumazet511c3f92009-06-02 05:14:27 +00001454 if (!rt_intern_hash(hash, rt, &rt, NULL))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455 ip_rt_put(rt);
1456 goto do_next;
1457 }
1458 rcu_read_unlock();
1459 do_next:
1460 ;
1461 }
1462 }
1463 in_dev_put(in_dev);
1464 return;
1465
1466reject_redirect:
1467#ifdef CONFIG_IP_ROUTE_VERBOSE
1468 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07001469 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1470 " Advised path = %pI4 -> %pI4\n",
1471 &old_gw, dev->name, &new_gw,
1472 &saddr, &daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001473#endif
1474 in_dev_put(in_dev);
1475}
1476
1477static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1478{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001479 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001480 struct dst_entry *ret = dst;
1481
1482 if (rt) {
1483 if (dst->obsolete) {
1484 ip_rt_put(rt);
1485 ret = NULL;
1486 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1487 rt->u.dst.expires) {
Al Viro8c7bc842006-09-26 21:26:19 -07001488 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001489 rt->fl.oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001490 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491#if RT_CACHE_DEBUG >= 1
Harvey Harrison673d57e2008-10-31 00:53:57 -07001492 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1493 &rt->rt_dst, rt->fl.fl4_tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001494#endif
1495 rt_del(hash, rt);
1496 ret = NULL;
1497 }
1498 }
1499 return ret;
1500}
1501
1502/*
1503 * Algorithm:
1504 * 1. The first ip_rt_redirect_number redirects are sent
1505 * with exponential backoff, then we stop sending them at all,
1506 * assuming that the host ignores our redirects.
1507 * 2. If we did not see packets requiring redirects
1508 * during ip_rt_redirect_silence, we assume that the host
1509 * forgot redirected route and start to send redirects again.
1510 *
1511 * This algorithm is much cheaper and more intelligent than dumb load limiting
1512 * in icmp.c.
1513 *
1514 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1515 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1516 */
1517
1518void ip_rt_send_redirect(struct sk_buff *skb)
1519{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001520 struct rtable *rt = skb_rtable(skb);
Eric Dumazet30038fc2009-08-28 23:52:01 -07001521 struct in_device *in_dev;
1522 int log_martians;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523
Eric Dumazet30038fc2009-08-28 23:52:01 -07001524 rcu_read_lock();
1525 in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1526 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1527 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001528 return;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001529 }
1530 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1531 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532
1533 /* No redirected packets during ip_rt_redirect_silence;
1534 * reset the algorithm.
1535 */
1536 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1537 rt->u.dst.rate_tokens = 0;
1538
1539 /* Too many ignored redirects; do not send anything
1540 * set u.dst.rate_last to the last seen redirected packet.
1541 */
1542 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1543 rt->u.dst.rate_last = jiffies;
Eric Dumazet30038fc2009-08-28 23:52:01 -07001544 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545 }
1546
1547 /* Check for load limit; set rate_last to the latest sent
1548 * redirect.
1549 */
Li Yewang14fb8a72006-12-18 00:26:35 -08001550 if (rt->u.dst.rate_tokens == 0 ||
1551 time_after(jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001552 (rt->u.dst.rate_last +
1553 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1554 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1555 rt->u.dst.rate_last = jiffies;
1556 ++rt->u.dst.rate_tokens;
1557#ifdef CONFIG_IP_ROUTE_VERBOSE
Eric Dumazet30038fc2009-08-28 23:52:01 -07001558 if (log_martians &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001559 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1560 net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07001561 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1562 &rt->rt_src, rt->rt_iif,
1563 &rt->rt_dst, &rt->rt_gateway);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001564#endif
1565 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001566}
1567
1568static int ip_error(struct sk_buff *skb)
1569{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001570 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001571 unsigned long now;
1572 int code;
1573
1574 switch (rt->u.dst.error) {
1575 case EINVAL:
1576 default:
1577 goto out;
1578 case EHOSTUNREACH:
1579 code = ICMP_HOST_UNREACH;
1580 break;
1581 case ENETUNREACH:
1582 code = ICMP_NET_UNREACH;
Pavel Emelyanov7c73a6f2008-07-16 20:20:11 -07001583 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1584 IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001585 break;
1586 case EACCES:
1587 code = ICMP_PKT_FILTERED;
1588 break;
1589 }
1590
1591 now = jiffies;
1592 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1593 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1594 rt->u.dst.rate_tokens = ip_rt_error_burst;
1595 rt->u.dst.rate_last = now;
1596 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1597 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1598 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1599 }
1600
1601out: kfree_skb(skb);
1602 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001603}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604
1605/*
1606 * The last two values are not from the RFC but
1607 * are needed for AMPRnet AX.25 paths.
1608 */
1609
/* Candidate MTU values, sorted from largest to smallest. */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value that is strictly smaller than old_mtu.
 * If old_mtu is not above any table entry, fall back to 68.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1622
Denis V. Lunevb5921912008-01-22 23:50:25 -08001623unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001624 unsigned short new_mtu,
1625 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001626{
Timo Teras0010e462008-04-29 03:32:25 -07001627 int i, k;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001628 unsigned short old_mtu = ntohs(iph->tot_len);
1629 struct rtable *rth;
Timo Teras0010e462008-04-29 03:32:25 -07001630 int ikeys[2] = { dev->ifindex, 0 };
Al Viroe4485152006-09-26 22:15:01 -07001631 __be32 skeys[2] = { iph->saddr, 0, };
1632 __be32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633 unsigned short est_mtu = 0;
1634
Timo Teras0010e462008-04-29 03:32:25 -07001635 for (k = 0; k < 2; k++) {
1636 for (i = 0; i < 2; i++) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001637 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001638 rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639
Timo Teras0010e462008-04-29 03:32:25 -07001640 rcu_read_lock();
1641 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1642 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643 unsigned short mtu = new_mtu;
1644
Timo Teras0010e462008-04-29 03:32:25 -07001645 if (rth->fl.fl4_dst != daddr ||
1646 rth->fl.fl4_src != skeys[i] ||
1647 rth->rt_dst != daddr ||
1648 rth->rt_src != iph->saddr ||
1649 rth->fl.oif != ikeys[k] ||
1650 rth->fl.iif != 0 ||
1651 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1652 !net_eq(dev_net(rth->u.dst.dev), net) ||
Hugh Dickins6c3b8fc2008-07-26 17:51:06 -07001653 rt_is_expired(rth))
Timo Teras0010e462008-04-29 03:32:25 -07001654 continue;
1655
Linus Torvalds1da177e2005-04-16 15:20:36 -07001656 if (new_mtu < 68 || new_mtu >= old_mtu) {
1657
1658 /* BSD 4.2 compatibility hack :-( */
1659 if (mtu == 0 &&
Rami Rosen6d273f82008-08-06 02:33:49 -07001660 old_mtu >= dst_mtu(&rth->u.dst) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001661 old_mtu >= 68 + (iph->ihl << 2))
1662 old_mtu -= iph->ihl << 2;
1663
1664 mtu = guess_mtu(old_mtu);
1665 }
Rami Rosen6d273f82008-08-06 02:33:49 -07001666 if (mtu <= dst_mtu(&rth->u.dst)) {
1667 if (mtu < dst_mtu(&rth->u.dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001668 dst_confirm(&rth->u.dst);
1669 if (mtu < ip_rt_min_pmtu) {
1670 mtu = ip_rt_min_pmtu;
1671 rth->u.dst.metrics[RTAX_LOCK-1] |=
1672 (1 << RTAX_MTU);
1673 }
1674 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1675 dst_set_expires(&rth->u.dst,
1676 ip_rt_mtu_expires);
1677 }
1678 est_mtu = mtu;
1679 }
1680 }
Timo Teras0010e462008-04-29 03:32:25 -07001681 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001682 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683 }
1684 return est_mtu ? : new_mtu;
1685}
1686
1687static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1688{
Rami Rosen6d273f82008-08-06 02:33:49 -07001689 if (dst_mtu(dst) > mtu && mtu >= 68 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001690 !(dst_metric_locked(dst, RTAX_MTU))) {
1691 if (mtu < ip_rt_min_pmtu) {
1692 mtu = ip_rt_min_pmtu;
1693 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1694 }
1695 dst->metrics[RTAX_MTU-1] = mtu;
1696 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001697 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698 }
1699}
1700
1701static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1702{
1703 return NULL;
1704}
1705
1706static void ipv4_dst_destroy(struct dst_entry *dst)
1707{
1708 struct rtable *rt = (struct rtable *) dst;
1709 struct inet_peer *peer = rt->peer;
1710 struct in_device *idev = rt->idev;
1711
1712 if (peer) {
1713 rt->peer = NULL;
1714 inet_putpeer(peer);
1715 }
1716
1717 if (idev) {
1718 rt->idev = NULL;
1719 in_dev_put(idev);
1720 }
1721}
1722
1723static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1724 int how)
1725{
1726 struct rtable *rt = (struct rtable *) dst;
1727 struct in_device *idev = rt->idev;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001728 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
Denis V. Lunev5a3e55d2007-12-07 00:38:10 -08001729 struct in_device *loopback_idev =
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001730 in_dev_get(dev_net(dev)->loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731 if (loopback_idev) {
1732 rt->idev = loopback_idev;
1733 in_dev_put(idev);
1734 }
1735 }
1736}
1737
1738static void ipv4_link_failure(struct sk_buff *skb)
1739{
1740 struct rtable *rt;
1741
1742 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1743
Eric Dumazet511c3f92009-06-02 05:14:27 +00001744 rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001745 if (rt)
1746 dst_set_expires(&rt->u.dst, 0);
1747}
1748
1749static int ip_rt_bug(struct sk_buff *skb)
1750{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001751 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1752 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001753 skb->dev ? skb->dev->name : "?");
1754 kfree_skb(skb);
1755 return 0;
1756}
1757
1758/*
1759 We do not cache source address of outgoing interface,
1760 because it is used only by IP RR, TS and SRR options,
1761 so that it out of fast path.
1762
1763 BTW remember: "addr" is allowed to be not aligned
1764 in IP options!
1765 */
1766
1767void ip_rt_get_source(u8 *addr, struct rtable *rt)
1768{
Al Viroa61ced52006-09-26 21:27:54 -07001769 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001770 struct fib_result res;
1771
1772 if (rt->fl.iif == 0)
1773 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001774 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775 src = FIB_RES_PREFSRC(res);
1776 fib_res_put(&res);
1777 } else
1778 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1779 RT_SCOPE_UNIVERSE);
1780 memcpy(addr, &src, 4);
1781}
1782
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently; a half is filled in from @tag only if
 * it is currently zero, so values that are already set are never
 * overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;

	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1792
1793static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1794{
1795 struct fib_info *fi = res->fi;
1796
1797 if (fi) {
1798 if (FIB_RES_GW(*res) &&
1799 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1800 rt->rt_gateway = FIB_RES_GW(*res);
1801 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1802 sizeof(rt->u.dst.metrics));
1803 if (fi->fib_mtu == 0) {
1804 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
Satoru SATOH0bbeafd2008-05-04 22:12:43 -07001805 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806 rt->rt_gateway != rt->rt_dst &&
1807 rt->u.dst.dev->mtu > 576)
1808 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1809 }
1810#ifdef CONFIG_NET_CLS_ROUTE
1811 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1812#endif
1813 } else
1814 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1815
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001816 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
Rami Rosen6d273f82008-08-06 02:33:49 -07001818 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001819 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001820 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1822 ip_rt_min_advmss);
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001823 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1825
1826#ifdef CONFIG_NET_CLS_ROUTE
1827#ifdef CONFIG_IP_MULTIPLE_TABLES
1828 set_class_tag(rt, fib_rules_tclass(res));
1829#endif
1830 set_class_tag(rt, itag);
1831#endif
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001832 rt->rt_type = res->type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001833}
1834
Al Viro9e12bb22006-09-26 21:25:20 -07001835static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001836 u8 tos, struct net_device *dev, int our)
1837{
1838 unsigned hash;
1839 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001840 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841 struct in_device *in_dev = in_dev_get(dev);
1842 u32 itag = 0;
1843
1844 /* Primary sanity checks. */
1845
1846 if (in_dev == NULL)
1847 return -EINVAL;
1848
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001849 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001850 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001851 goto e_inval;
1852
Joe Perchesf97c1e02007-12-16 13:45:43 -08001853 if (ipv4_is_zeronet(saddr)) {
1854 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855 goto e_inval;
1856 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1857 } else if (fib_validate_source(saddr, 0, tos, 0,
jamalb0c110c2009-10-18 02:12:33 +00001858 dev, &spec_dst, &itag, 0) < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001859 goto e_inval;
1860
1861 rth = dst_alloc(&ipv4_dst_ops);
1862 if (!rth)
1863 goto e_nobufs;
1864
1865 rth->u.dst.output= ip_rt_bug;
1866
1867 atomic_set(&rth->u.dst.__refcnt, 1);
1868 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001869 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870 rth->u.dst.flags |= DST_NOPOLICY;
1871 rth->fl.fl4_dst = daddr;
1872 rth->rt_dst = daddr;
1873 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001874 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001875 rth->fl.fl4_src = saddr;
1876 rth->rt_src = saddr;
1877#ifdef CONFIG_NET_CLS_ROUTE
1878 rth->u.dst.tclassid = itag;
1879#endif
1880 rth->rt_iif =
1881 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001882 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001883 dev_hold(rth->u.dst.dev);
1884 rth->idev = in_dev_get(rth->u.dst.dev);
1885 rth->fl.oif = 0;
1886 rth->rt_gateway = daddr;
1887 rth->rt_spec_dst= spec_dst;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001888 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001889 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001890 rth->rt_type = RTN_MULTICAST;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001891 if (our) {
1892 rth->u.dst.input= ip_local_deliver;
1893 rth->rt_flags |= RTCF_LOCAL;
1894 }
1895
1896#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001897 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001898 rth->u.dst.input = ip_mr_input;
1899#endif
1900 RT_CACHE_STAT_INC(in_slow_mc);
1901
1902 in_dev_put(in_dev);
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001903 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
Eric Dumazet511c3f92009-06-02 05:14:27 +00001904 return rt_intern_hash(hash, rth, NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001905
1906e_nobufs:
1907 in_dev_put(in_dev);
1908 return -ENOBUFS;
1909
1910e_inval:
1911 in_dev_put(in_dev);
1912 return -EINVAL;
1913}
1914
1915
1916static void ip_handle_martian_source(struct net_device *dev,
1917 struct in_device *in_dev,
1918 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001919 __be32 daddr,
1920 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001921{
1922 RT_CACHE_STAT_INC(in_martian_src);
1923#ifdef CONFIG_IP_ROUTE_VERBOSE
1924 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1925 /*
1926 * RFC1812 recommendation, if source is martian,
1927 * the only hint is MAC header.
1928 */
Harvey Harrison673d57e2008-10-31 00:53:57 -07001929 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1930 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001931 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001932 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001933 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001934 printk(KERN_WARNING "ll header: ");
1935 for (i = 0; i < dev->hard_header_len; i++, p++) {
1936 printk("%02x", *p);
1937 if (i < (dev->hard_header_len - 1))
1938 printk(":");
1939 }
1940 printk("\n");
1941 }
1942 }
1943#endif
1944}
1945
Stephen Hemminger5969f712008-04-10 01:52:09 -07001946static int __mkroute_input(struct sk_buff *skb,
1947 struct fib_result *res,
1948 struct in_device *in_dev,
1949 __be32 daddr, __be32 saddr, u32 tos,
1950 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001951{
1952
1953 struct rtable *rth;
1954 int err;
1955 struct in_device *out_dev;
1956 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001957 __be32 spec_dst;
1958 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001959
1960 /* get a working reference to the output device */
1961 out_dev = in_dev_get(FIB_RES_DEV(*res));
1962 if (out_dev == NULL) {
1963 if (net_ratelimit())
1964 printk(KERN_CRIT "Bug in ip_route_input" \
1965 "_slow(). Please, report\n");
1966 return -EINVAL;
1967 }
1968
1969
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001970 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
jamalb0c110c2009-10-18 02:12:33 +00001971 in_dev->dev, &spec_dst, &itag, skb->mark);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001972 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001973 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001974 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001975
Linus Torvalds1da177e2005-04-16 15:20:36 -07001976 err = -EINVAL;
1977 goto cleanup;
1978 }
1979
1980 if (err)
1981 flags |= RTCF_DIRECTSRC;
1982
Thomas Graf51b77ca2008-06-03 16:36:01 -07001983 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001984 (IN_DEV_SHARED_MEDIA(out_dev) ||
1985 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1986 flags |= RTCF_DOREDIRECT;
1987
1988 if (skb->protocol != htons(ETH_P_IP)) {
1989 /* Not IP (i.e. ARP). Do not create route, if it is
1990 * invalid for proxy arp. DNAT routes are always valid.
1991 */
Rami Rosencb7928a2008-01-09 00:18:24 -08001992 if (out_dev == in_dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993 err = -EINVAL;
1994 goto cleanup;
1995 }
1996 }
1997
1998
1999 rth = dst_alloc(&ipv4_dst_ops);
2000 if (!rth) {
2001 err = -ENOBUFS;
2002 goto cleanup;
2003 }
2004
Julian Anastasovce723d82005-09-08 13:34:47 -07002005 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002006 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002007 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002008 rth->u.dst.flags |= DST_NOPOLICY;
Herbert Xu42f811b2007-06-04 23:34:44 -07002009 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002010 rth->u.dst.flags |= DST_NOXFRM;
2011 rth->fl.fl4_dst = daddr;
2012 rth->rt_dst = daddr;
2013 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002014 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002015 rth->fl.fl4_src = saddr;
2016 rth->rt_src = saddr;
2017 rth->rt_gateway = daddr;
2018 rth->rt_iif =
2019 rth->fl.iif = in_dev->dev->ifindex;
2020 rth->u.dst.dev = (out_dev)->dev;
2021 dev_hold(rth->u.dst.dev);
2022 rth->idev = in_dev_get(rth->u.dst.dev);
2023 rth->fl.oif = 0;
2024 rth->rt_spec_dst= spec_dst;
2025
2026 rth->u.dst.input = ip_forward;
2027 rth->u.dst.output = ip_output;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002028 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002029
2030 rt_set_nexthop(rth, res, itag);
2031
2032 rth->rt_flags = flags;
2033
2034 *result = rth;
2035 err = 0;
2036 cleanup:
2037 /* release the working reference to the output device */
2038 in_dev_put(out_dev);
2039 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002040}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002041
Stephen Hemminger5969f712008-04-10 01:52:09 -07002042static int ip_mkroute_input(struct sk_buff *skb,
2043 struct fib_result *res,
2044 const struct flowi *fl,
2045 struct in_device *in_dev,
2046 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002047{
Chuck Short7abaa272005-06-22 22:10:23 -07002048 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002049 int err;
2050 unsigned hash;
2051
2052#ifdef CONFIG_IP_ROUTE_MULTIPATH
2053 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2054 fib_select_multipath(fl, res);
2055#endif
2056
2057 /* create a routing cache entry */
2058 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2059 if (err)
2060 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002061
2062 /* put it into the cache */
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002063 hash = rt_hash(daddr, saddr, fl->iif,
2064 rt_genid(dev_net(rth->u.dst.dev)));
Eric Dumazet511c3f92009-06-02 05:14:27 +00002065 return rt_intern_hash(hash, rth, NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066}
2067
/*
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped back packet must already have
 *	the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
2077
Al Viro9e12bb22006-09-26 21:25:20 -07002078static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 u8 tos, struct net_device *dev)
2080{
2081 struct fib_result res;
2082 struct in_device *in_dev = in_dev_get(dev);
2083 struct flowi fl = { .nl_u = { .ip4_u =
2084 { .daddr = daddr,
2085 .saddr = saddr,
2086 .tos = tos,
2087 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002089 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002090 .iif = dev->ifindex };
2091 unsigned flags = 0;
2092 u32 itag = 0;
2093 struct rtable * rth;
2094 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002095 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002096 int err = -EINVAL;
2097 int free_res = 0;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002098 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002099
2100 /* IP on this device is disabled. */
2101
2102 if (!in_dev)
2103 goto out;
2104
2105 /* Check for the most weird martians, which can be not detected
2106 by fib_lookup.
2107 */
2108
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002109 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002110 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002111 goto martian_source;
2112
Al Viroe4485152006-09-26 22:15:01 -07002113 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002114 goto brd_input;
2115
2116 /* Accept zero addresses only to limited broadcast;
2117 * I even do not know to fix it or not. Waiting for complains :-)
2118 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002119 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002120 goto martian_source;
2121
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002122 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002123 ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002124 goto martian_destination;
2125
2126 /*
2127 * Now we are ready to route packet.
2128 */
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002129 if ((err = fib_lookup(net, &fl, &res)) != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002130 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002131 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002132 goto no_route;
2133 }
2134 free_res = 1;
2135
2136 RT_CACHE_STAT_INC(in_slow_tot);
2137
2138 if (res.type == RTN_BROADCAST)
2139 goto brd_input;
2140
2141 if (res.type == RTN_LOCAL) {
2142 int result;
2143 result = fib_validate_source(saddr, daddr, tos,
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002144 net->loopback_dev->ifindex,
jamalb0c110c2009-10-18 02:12:33 +00002145 dev, &spec_dst, &itag, skb->mark);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146 if (result < 0)
2147 goto martian_source;
2148 if (result)
2149 flags |= RTCF_DIRECTSRC;
2150 spec_dst = daddr;
2151 goto local_input;
2152 }
2153
2154 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002155 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002156 if (res.type != RTN_UNICAST)
2157 goto martian_destination;
2158
2159 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160done:
2161 in_dev_put(in_dev);
2162 if (free_res)
2163 fib_res_put(&res);
2164out: return err;
2165
2166brd_input:
2167 if (skb->protocol != htons(ETH_P_IP))
2168 goto e_inval;
2169
Joe Perchesf97c1e02007-12-16 13:45:43 -08002170 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2172 else {
2173 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
jamalb0c110c2009-10-18 02:12:33 +00002174 &itag, skb->mark);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002175 if (err < 0)
2176 goto martian_source;
2177 if (err)
2178 flags |= RTCF_DIRECTSRC;
2179 }
2180 flags |= RTCF_BROADCAST;
2181 res.type = RTN_BROADCAST;
2182 RT_CACHE_STAT_INC(in_brd);
2183
2184local_input:
2185 rth = dst_alloc(&ipv4_dst_ops);
2186 if (!rth)
2187 goto e_nobufs;
2188
2189 rth->u.dst.output= ip_rt_bug;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002190 rth->rt_genid = rt_genid(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002191
2192 atomic_set(&rth->u.dst.__refcnt, 1);
2193 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002194 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002195 rth->u.dst.flags |= DST_NOPOLICY;
2196 rth->fl.fl4_dst = daddr;
2197 rth->rt_dst = daddr;
2198 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002199 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002200 rth->fl.fl4_src = saddr;
2201 rth->rt_src = saddr;
2202#ifdef CONFIG_NET_CLS_ROUTE
2203 rth->u.dst.tclassid = itag;
2204#endif
2205 rth->rt_iif =
2206 rth->fl.iif = dev->ifindex;
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002207 rth->u.dst.dev = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208 dev_hold(rth->u.dst.dev);
2209 rth->idev = in_dev_get(rth->u.dst.dev);
2210 rth->rt_gateway = daddr;
2211 rth->rt_spec_dst= spec_dst;
2212 rth->u.dst.input= ip_local_deliver;
2213 rth->rt_flags = flags|RTCF_LOCAL;
2214 if (res.type == RTN_UNREACHABLE) {
2215 rth->u.dst.input= ip_error;
2216 rth->u.dst.error= -err;
2217 rth->rt_flags &= ~RTCF_LOCAL;
2218 }
2219 rth->rt_type = res.type;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002220 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
Eric Dumazet511c3f92009-06-02 05:14:27 +00002221 err = rt_intern_hash(hash, rth, NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002222 goto done;
2223
2224no_route:
2225 RT_CACHE_STAT_INC(in_no_route);
2226 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2227 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002228 if (err == -ESRCH)
2229 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230 goto local_input;
2231
2232 /*
2233 * Do not cache martian addresses: they should be logged (RFC1812)
2234 */
2235martian_destination:
2236 RT_CACHE_STAT_INC(in_martian_dst);
2237#ifdef CONFIG_IP_ROUTE_VERBOSE
2238 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07002239 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2240 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002241#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002242
2243e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002244 err = -EHOSTUNREACH;
2245 goto done;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002246
Linus Torvalds1da177e2005-04-16 15:20:36 -07002247e_inval:
2248 err = -EINVAL;
2249 goto done;
2250
2251e_nobufs:
2252 err = -ENOBUFS;
2253 goto done;
2254
2255martian_source:
2256 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2257 goto e_inval;
2258}
2259
Al Viro9e12bb22006-09-26 21:25:20 -07002260int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002261 u8 tos, struct net_device *dev)
2262{
2263 struct rtable * rth;
2264 unsigned hash;
2265 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002266 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002267
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002268 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002269
2270 if (!rt_caching(net))
2271 goto skip_cache;
2272
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002274 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002275
2276 rcu_read_lock();
2277 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002278 rth = rcu_dereference(rth->u.dst.rt_next)) {
Stephen Hemmingerc0b8c322008-04-10 04:00:28 -07002279 if (((rth->fl.fl4_dst ^ daddr) |
2280 (rth->fl.fl4_src ^ saddr) |
2281 (rth->fl.iif ^ iif) |
2282 rth->fl.oif |
2283 (rth->fl.fl4_tos ^ tos)) == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002284 rth->fl.mark == skb->mark &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002285 net_eq(dev_net(rth->u.dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002286 !rt_is_expired(rth)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002287 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288 RT_CACHE_STAT_INC(in_hit);
2289 rcu_read_unlock();
Eric Dumazetadf30902009-06-02 05:19:30 +00002290 skb_dst_set(skb, &rth->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 return 0;
2292 }
2293 RT_CACHE_STAT_INC(in_hlist_search);
2294 }
2295 rcu_read_unlock();
2296
Neil Horman1080d702008-10-27 12:28:25 -07002297skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002298 /* Multicast recognition logic is moved from route cache to here.
2299 The problem was that too many Ethernet cards have broken/missing
2300 hardware multicast filters :-( As result the host on multicasting
2301 network acquires a lot of useless route cache entries, sort of
2302 SDR messages from all the world. Now we try to get rid of them.
2303 Really, provided software IP multicast filter is organized
2304 reasonably (at least, hashed), it does not result in a slowdown
2305 comparing with route cache reject entries.
2306 Note, that multicast routers are not affected, because
2307 route cache entry is created eventually.
2308 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002309 if (ipv4_is_multicast(daddr)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002310 struct in_device *in_dev;
2311
2312 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002313 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002314 int our = ip_check_mc(in_dev, daddr, saddr,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002315 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002316 if (our
2317#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002318 ||
2319 (!ipv4_is_local_multicast(daddr) &&
2320 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002322 ) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002323 rcu_read_unlock();
2324 return ip_route_input_mc(skb, daddr, saddr,
2325 tos, dev, our);
2326 }
2327 }
2328 rcu_read_unlock();
2329 return -EINVAL;
2330 }
2331 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2332}
2333
/*
 * Allocate and fill in (but do not hash) an output route cache entry.
 *
 * result:  receives the new rtable; written only on success.
 * res:     FIB lookup result; res->type may be overridden and res->fi may
 *          be released and cleared here for broadcast/multicast targets.
 * fl:      resolved flow (addresses actually used for rt_dst/rt_src).
 * oldflp:  flow as requested by the caller; copied into the cache key.
 * dev_out: output device; an extra reference is taken for the new entry.
 * flags:   initial RTCF_* flags, extended below.
 *
 * Returns 0 on success, -EINVAL for an unusable source/destination or when
 * dev_out has no in_device, and -ENOBUFS when dst_alloc() fails.
 */
Stephen Hemminger5969f712008-04-10 01:52:09 -07002334static int __mkroute_output(struct rtable **result,
2335 struct fib_result *res,
2336 const struct flowi *fl,
2337 const struct flowi *oldflp,
2338 struct net_device *dev_out,
2339 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340{
2341 struct rtable *rth;
2342 struct in_device *in_dev;
2343 u32 tos = RT_FL_TOS(oldflp);
2344 int err = 0;
2345
/* A loopback source address is only acceptable on a loopback device. */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002346 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347 return -EINVAL;
2348
/* Classify by destination address, overriding the FIB result type. */
Al Viroe4485152006-09-26 22:15:01 -07002349 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002350 res->type = RTN_BROADCAST;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002351 else if (ipv4_is_multicast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002352 res->type = RTN_MULTICAST;
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002353 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354 return -EINVAL;
2355
2356 if (dev_out->flags & IFF_LOOPBACK)
2357 flags |= RTCF_LOCAL;
2358
2359 /* get work reference to inet device */
2360 in_dev = in_dev_get(dev_out);
2361 if (!in_dev)
2362 return -EINVAL;
2363
2364 if (res->type == RTN_BROADCAST) {
2365 flags |= RTCF_BROADCAST | RTCF_LOCAL;
/* Broadcast routes carry no FIB info: drop the reference if one is held. */
2366 if (res->fi) {
2367 fib_info_put(res->fi);
2368 res->fi = NULL;
2369 }
2370 } else if (res->type == RTN_MULTICAST) {
2371 flags |= RTCF_MULTICAST|RTCF_LOCAL;
/* RTCF_LOCAL is kept only when ip_check_mc() reports a match on this device. */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002372 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002373 oldflp->proto))
2374 flags &= ~RTCF_LOCAL;
2375 /* If multicast route do not exist use
2376 default one, but do not gateway in this case.
2377 Yes, it is hack.
2378 */
2379 if (res->fi && res->prefixlen < 4) {
2380 fib_info_put(res->fi);
2381 res->fi = NULL;
2382 }
2383 }
2384
2385
2386 rth = dst_alloc(&ipv4_dst_ops);
2387 if (!rth) {
2388 err = -ENOBUFS;
2389 goto cleanup;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002390 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002391
Julian Anastasovce723d82005-09-08 13:34:47 -07002392 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002393 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002394 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395 rth->u.dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002396 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002397 rth->u.dst.flags |= DST_NOPOLICY;
2398
/*
 * The cache key (rth->fl) is taken from the caller's original request,
 * while rt_dst/rt_src record the addresses resolved into fl.
 */
2399 rth->fl.fl4_dst = oldflp->fl4_dst;
2400 rth->fl.fl4_tos = tos;
2401 rth->fl.fl4_src = oldflp->fl4_src;
2402 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002403 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002404 rth->rt_dst = fl->fl4_dst;
2405 rth->rt_src = fl->fl4_src;
2406 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002407 /* get references to the devices that are to be hold by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002408 cache entry */
2409 rth->u.dst.dev = dev_out;
2410 dev_hold(dev_out);
2411 rth->idev = in_dev_get(dev_out);
2412 rth->rt_gateway = fl->fl4_dst;
2413 rth->rt_spec_dst= fl->fl4_src;
2414
2415 rth->u.dst.output=ip_output;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002416 rth->rt_genid = rt_genid(dev_net(dev_out));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002417
2418 RT_CACHE_STAT_INC(out_slow_tot);
2419
/* Locally delivered traffic gets an input handler and dst as spec_dst. */
2420 if (flags & RTCF_LOCAL) {
2421 rth->u.dst.input = ip_local_deliver;
2422 rth->rt_spec_dst = fl->fl4_dst;
2423 }
/* Broadcast/multicast: spec_dst goes back to the source address. */
2424 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2425 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002426 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002427 !(dev_out->flags & IFF_LOOPBACK)) {
2428 rth->u.dst.output = ip_mc_output;
2429 RT_CACHE_STAT_INC(out_slow_mc);
2430 }
2431#ifdef CONFIG_IP_MROUTE
2432 if (res->type == RTN_MULTICAST) {
2433 if (IN_DEV_MFORWARD(in_dev) &&
Joe Perchesf97c1e02007-12-16 13:45:43 -08002434 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002435 rth->u.dst.input = ip_mr_input;
2436 rth->u.dst.output = ip_mc_output;
2437 }
2438 }
2439#endif
2440 }
2441
2442 rt_set_nexthop(rth, res, 0);
2443
2444 rth->rt_flags = flags;
2445
2446 *result = rth;
/* Reached on success and on allocation failure alike. */
2447 cleanup:
2448 /* release work reference to inet device */
2449 in_dev_put(in_dev);
2450
2451 return err;
2452}
2453
Stephen Hemminger5969f712008-04-10 01:52:09 -07002454static int ip_mkroute_output(struct rtable **rp,
2455 struct fib_result *res,
2456 const struct flowi *fl,
2457 const struct flowi *oldflp,
2458 struct net_device *dev_out,
2459 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002460{
Chuck Short7abaa272005-06-22 22:10:23 -07002461 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002462 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2463 unsigned hash;
2464 if (err == 0) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07002465 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002466 rt_genid(dev_net(dev_out)));
Eric Dumazet511c3f92009-06-02 05:14:27 +00002467 err = rt_intern_hash(hash, rth, rp, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002468 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002469
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470 return err;
2471}
2472
Linus Torvalds1da177e2005-04-16 15:20:36 -07002473/*
2474 * Major route resolver routine.
2475 */
2476
/*
 * Resolve an output route for oldflp through the FIB (no cache lookup) and
 * hand the result to ip_mkroute_output(), which stores the entry in *rp.
 *
 * A working copy of the flow (fl) is built from oldflp with the TOS masked
 * by IPTOS_RT_MASK, link scope when RTO_ONLINK is set, and the loopback
 * device as input interface.  The function then walks these cases in order:
 *   - a source address was given: reject multicast/limited-broadcast/zeronet
 *     sources; for multicast or limited-broadcast destinations without an
 *     oif, route via the device owning the source address;
 *   - an output interface was given: it must exist and have an in_device;
 *   - no destination: route to loopback;
 *   - FIB lookup failed: assume on-link if an oif was given, else fail;
 *   - local destination: route via loopback;
 *   - otherwise: use the (possibly multipath/default-selected) FIB result.
 *
 * Returns 0 or a negative errno (-EINVAL, -ENODEV, -ENETUNREACH, or whatever
 * ip_mkroute_output() returns).  Any reference held on dev_out is dropped
 * before returning.
 */
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002477static int ip_route_output_slow(struct net *net, struct rtable **rp,
2478 const struct flowi *oldflp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002479{
2480 u32 tos = RT_FL_TOS(oldflp);
2481 struct flowi fl = { .nl_u = { .ip4_u =
2482 { .daddr = oldflp->fl4_dst,
2483 .saddr = oldflp->fl4_src,
2484 .tos = tos & IPTOS_RT_MASK,
2485 .scope = ((tos & RTO_ONLINK) ?
2486 RT_SCOPE_LINK :
2487 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002488 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002489 .mark = oldflp->mark,
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002490 .iif = net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491 .oif = oldflp->oif };
2492 struct fib_result res;
2493 unsigned flags = 0;
2494 struct net_device *dev_out = NULL;
2495 int free_res = 0;
2496 int err;
2497
2498
2499 res.fi = NULL;
2500#ifdef CONFIG_IP_MULTIPLE_TABLES
2501 res.r = NULL;
2502#endif
2503
2504 if (oldflp->fl4_src) {
2505 err = -EINVAL;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002506 if (ipv4_is_multicast(oldflp->fl4_src) ||
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002507 ipv4_is_lbcast(oldflp->fl4_src) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002508 ipv4_is_zeronet(oldflp->fl4_src))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002509 goto out;
2510
Linus Torvalds1da177e2005-04-16 15:20:36 -07002511 /* I removed check for oif == dev_out->oif here.
2512 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002513 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2514 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002515 2. Moreover, we are allowed to send packets with saddr
2516 of another iface. --ANK
2517 */
2518
Joe Perches9d4fb272009-11-23 10:41:23 -08002519 if (oldflp->oif == 0 &&
2520 (ipv4_is_multicast(oldflp->fl4_dst) ||
2521 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002522 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2523 dev_out = ip_dev_find(net, oldflp->fl4_src);
2524 if (dev_out == NULL)
2525 goto out;
2526
Linus Torvalds1da177e2005-04-16 15:20:36 -07002527 /* Special hack: user can direct multicasts
2528 and limited broadcast via necessary interface
2529 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2530 This hack is not just for fun, it allows
2531 vic,vat and friends to work.
2532 They bind socket to loopback, set ttl to zero
2533 and expect that it will work.
2534 From the viewpoint of routing cache they are broken,
2535 because we are not allowed to build multicast path
2536 with loopback source addr (look, routing cache
2537 cannot know, that ttl is zero, so that packet
2538 will not leave this host and route is valid).
2539 Luckily, this hack is good workaround.
2540 */
2541
2542 fl.oif = dev_out->ifindex;
2543 goto make_route;
2544 }
Julian Anastasova210d012008-10-01 07:28:28 -07002545
/*
 * Unless the caller set FLOWI_FLAG_ANYSRC, the source must be found by
 * ip_dev_find(); the device itself is not needed, so it is released again.
 */
2546 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2547 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2548 dev_out = ip_dev_find(net, oldflp->fl4_src);
2549 if (dev_out == NULL)
2550 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551 dev_put(dev_out);
Julian Anastasova210d012008-10-01 07:28:28 -07002552 dev_out = NULL;
2553 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002554 }
2555
2556
/* Caller pinned the output interface: it must exist and have an in_device. */
2557 if (oldflp->oif) {
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002558 dev_out = dev_get_by_index(net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002559 err = -ENODEV;
2560 if (dev_out == NULL)
2561 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002562
2563 /* RACE: Check return value of inet_select_addr instead. */
2564 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002565 dev_put(dev_out);
2566 goto out; /* Wrong error code */
2567 }
2568
Joe Perchesf97c1e02007-12-16 13:45:43 -08002569 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2570 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002571 if (!fl.fl4_src)
2572 fl.fl4_src = inet_select_addr(dev_out, 0,
2573 RT_SCOPE_LINK);
2574 goto make_route;
2575 }
2576 if (!fl.fl4_src) {
Joe Perchesf97c1e02007-12-16 13:45:43 -08002577 if (ipv4_is_multicast(oldflp->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002578 fl.fl4_src = inet_select_addr(dev_out, 0,
2579 fl.fl4_scope);
2580 else if (!oldflp->fl4_dst)
2581 fl.fl4_src = inet_select_addr(dev_out, 0,
2582 RT_SCOPE_HOST);
2583 }
2584 }
2585
/*
 * No destination: use the source as destination, falling back to
 * INADDR_LOOPBACK for both, and route via the loopback device.
 */
2586 if (!fl.fl4_dst) {
2587 fl.fl4_dst = fl.fl4_src;
2588 if (!fl.fl4_dst)
2589 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2590 if (dev_out)
2591 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002592 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593 dev_hold(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002594 fl.oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002595 res.type = RTN_LOCAL;
2596 flags |= RTCF_LOCAL;
2597 goto make_route;
2598 }
2599
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002600 if (fib_lookup(net, &fl, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002601 res.fi = NULL;
2602 if (oldflp->oif) {
2603 /* Apparently, routing tables are wrong. Assume,
2604 that the destination is on link.
2605
2606 WHY? DW.
2607 Because we are allowed to send to iface
2608 even if it has NO routes and NO assigned
2609 addresses. When oif is specified, routing
2610 tables are looked up with only one purpose:
2611 to catch if destination is gatewayed, rather than
2612 direct. Moreover, if MSG_DONTROUTE is set,
2613 we send packet, ignoring both routing tables
2614 and ifaddr state. --ANK
2615
2616
2617 We could make it even if oif is unknown,
2618 likely IPv6, but we do not.
2619 */
2620
2621 if (fl.fl4_src == 0)
2622 fl.fl4_src = inet_select_addr(dev_out, 0,
2623 RT_SCOPE_LINK);
2624 res.type = RTN_UNICAST;
2625 goto make_route;
2626 }
2627 if (dev_out)
2628 dev_put(dev_out);
2629 err = -ENETUNREACH;
2630 goto out;
2631 }
/* From here on res is released with fib_res_put() after make_route. */
2632 free_res = 1;
2633
/* Local destination: deliver through loopback and drop the FIB info. */
2634 if (res.type == RTN_LOCAL) {
2635 if (!fl.fl4_src)
2636 fl.fl4_src = fl.fl4_dst;
2637 if (dev_out)
2638 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002639 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002640 dev_hold(dev_out);
2641 fl.oif = dev_out->ifindex;
2642 if (res.fi)
2643 fib_info_put(res.fi);
2644 res.fi = NULL;
2645 flags |= RTCF_LOCAL;
2646 goto make_route;
2647 }
2648
2649#ifdef CONFIG_IP_ROUTE_MULTIPATH
2650 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2651 fib_select_multipath(&fl, &res);
2652 else
2653#endif
2654 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002655 fib_select_default(net, &fl, &res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002656
2657 if (!fl.fl4_src)
2658 fl.fl4_src = FIB_RES_PREFSRC(res);
2659
/* Switch dev_out to the device chosen by the FIB result. */
2660 if (dev_out)
2661 dev_put(dev_out);
2662 dev_out = FIB_RES_DEV(res);
2663 dev_hold(dev_out);
2664 fl.oif = dev_out->ifindex;
2665
2666
/* Common exit for every successful resolution: build and hash the entry. */
2667make_route:
2668 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2669
2670
2671 if (free_res)
2672 fib_res_put(&res);
2673 if (dev_out)
2674 dev_put(dev_out);
2675out: return err;
2676}
2677
Denis V. Lunev611c1832008-01-22 22:06:48 -08002678int __ip_route_output_key(struct net *net, struct rtable **rp,
2679 const struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002680{
2681 unsigned hash;
2682 struct rtable *rth;
2683
Neil Horman1080d702008-10-27 12:28:25 -07002684 if (!rt_caching(net))
2685 goto slow_output;
2686
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002687 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002688
2689 rcu_read_lock_bh();
2690 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002691 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002692 if (rth->fl.fl4_dst == flp->fl4_dst &&
2693 rth->fl.fl4_src == flp->fl4_src &&
2694 rth->fl.iif == 0 &&
2695 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002696 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002697 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002698 (IPTOS_RT_MASK | RTO_ONLINK)) &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002699 net_eq(dev_net(rth->u.dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002700 !rt_is_expired(rth)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002701 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002702 RT_CACHE_STAT_INC(out_hit);
2703 rcu_read_unlock_bh();
2704 *rp = rth;
2705 return 0;
2706 }
2707 RT_CACHE_STAT_INC(out_hlist_search);
2708 }
2709 rcu_read_unlock_bh();
2710
Neil Horman1080d702008-10-27 12:28:25 -07002711slow_output:
Denis V. Lunev611c1832008-01-22 22:06:48 -08002712 return ip_route_output_slow(net, rp, flp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002713}
2714
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002715EXPORT_SYMBOL_GPL(__ip_route_output_key);
2716
/*
 * update_pmtu hook used by ipv4_dst_blackhole_ops.  The body is empty, so
 * the mtu passed in is ignored and the dst entry is left untouched.
 */
David S. Miller14e50e52007-05-24 18:17:54 -07002717static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2718{
2719}
2720
/*
 * dst_ops used for the blackhole entries allocated by ipv4_dst_blackhole().
 * Destroy and check are the regular IPv4 callbacks; update_pmtu is the
 * empty ipv4_rt_blackhole_update_pmtu() hook.
 */
2721static struct dst_ops ipv4_dst_blackhole_ops = {
2722 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -08002723 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002724 .destroy = ipv4_dst_destroy,
2725 .check = ipv4_dst_check,
2726 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
Eric Dumazete2422972008-01-30 20:07:45 -08002727 .entries = ATOMIC_INIT(0),
David S. Miller14e50e52007-05-24 18:17:54 -07002728};
2729
2730
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002731static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
David S. Miller14e50e52007-05-24 18:17:54 -07002732{
2733 struct rtable *ort = *rp;
2734 struct rtable *rt = (struct rtable *)
2735 dst_alloc(&ipv4_dst_blackhole_ops);
2736
2737 if (rt) {
2738 struct dst_entry *new = &rt->u.dst;
2739
2740 atomic_set(&new->__refcnt, 1);
2741 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002742 new->input = dst_discard;
2743 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002744 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2745
2746 new->dev = ort->u.dst.dev;
2747 if (new->dev)
2748 dev_hold(new->dev);
2749
2750 rt->fl = ort->fl;
2751
2752 rt->idev = ort->idev;
2753 if (rt->idev)
2754 in_dev_hold(rt->idev);
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002755 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002756 rt->rt_flags = ort->rt_flags;
2757 rt->rt_type = ort->rt_type;
2758 rt->rt_dst = ort->rt_dst;
2759 rt->rt_src = ort->rt_src;
2760 rt->rt_iif = ort->rt_iif;
2761 rt->rt_gateway = ort->rt_gateway;
2762 rt->rt_spec_dst = ort->rt_spec_dst;
2763 rt->peer = ort->peer;
2764 if (rt->peer)
2765 atomic_inc(&rt->peer->refcnt);
2766
2767 dst_free(new);
2768 }
2769
2770 dst_release(&(*rp)->u.dst);
2771 *rp = rt;
2772 return (rt ? 0 : -ENOMEM);
2773}
2774
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002775int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2776 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002777{
2778 int err;
2779
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002780 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002781 return err;
2782
2783 if (flp->proto) {
2784 if (!flp->fl4_src)
2785 flp->fl4_src = (*rp)->rt_src;
2786 if (!flp->fl4_dst)
2787 flp->fl4_dst = (*rp)->rt_dst;
Alexey Dobriyan52479b62008-11-25 17:35:18 -08002788 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
Herbert Xubb728452007-12-12 18:48:58 -08002789 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002790 if (err == -EREMOTE)
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002791 err = ipv4_dst_blackhole(net, rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002792
2793 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794 }
2795
2796 return 0;
2797}
2798
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002799EXPORT_SYMBOL_GPL(ip_route_output_flow);
2800
/*
 * Convenience wrapper: ip_route_output_flow() with no socket (NULL) and
 * flags of 0.  Returns whatever ip_route_output_flow() returns.
 */
Denis V. Lunevf2063512008-01-22 22:07:34 -08002801int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002802{
Denis V. Lunevf2063512008-01-22 22:07:34 -08002803 return ip_route_output_flow(net, rp, flp, NULL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002804}
2805
/*
 * Append a netlink route message describing the route attached to skb
 * (skb_rtable(skb)) to that same skb.
 *
 * pid/seq/event/flags are passed straight to nlmsg_put().  nowait is
 * forwarded to ipmr_get_route() and selects how its result is handled
 * (see the CONFIG_IP_MROUTE branch below).
 *
 * Returns the value of nlmsg_end() on success, 0 when ipmr_get_route()
 * returned 0 and nowait is not set, and -EMSGSIZE when the message could
 * not be started or completed (a partially built message is cancelled).
 *
 * NOTE(review): the NLA_PUT* macros are expected to jump to
 * nla_put_failure when an attribute cannot be added -- confirm against
 * their definition.
 */
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002806static int rt_fill_info(struct net *net,
2807 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002808 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002809{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002810 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002811 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002812 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002813 long expires;
2814 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002815
2816 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2817 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002818 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002819
/* Fixed rtmsg header: always a /32 destination reported in the main table. */
2820 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821 r->rtm_family = AF_INET;
2822 r->rtm_dst_len = 32;
2823 r->rtm_src_len = 0;
2824 r->rtm_tos = rt->fl.fl4_tos;
2825 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002826 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002827 r->rtm_type = rt->rt_type;
2828 r->rtm_scope = RT_SCOPE_UNIVERSE;
2829 r->rtm_protocol = RTPROT_UNSPEC;
2830 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2831 if (rt->rt_flags & RTCF_NOTIFY)
2832 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002833
Al Viro17fb2c62006-09-26 22:15:25 -07002834 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002835
Linus Torvalds1da177e2005-04-16 15:20:36 -07002836 if (rt->fl.fl4_src) {
2837 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002838 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002839 }
2840 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002841 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002842#ifdef CONFIG_NET_CLS_ROUTE
2843 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002844 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845#endif
/*
 * Preferred source: rt_spec_dst when the route has an input interface,
 * otherwise rt_src if it differs from the source in the cache key.
 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002847 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002848 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002849 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002850
Linus Torvalds1da177e2005-04-16 15:20:36 -07002851 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002852 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002853
Linus Torvalds1da177e2005-04-16 15:20:36 -07002854 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002855 goto nla_put_failure;
2856
/* Gather the cache-info fields; expires is relative to the current jiffies. */
Thomas Grafe3703b32006-11-27 09:27:07 -08002857 error = rt->u.dst.error;
2858 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002859 if (rt->peer) {
Eric Dumazet2c1409a2009-11-12 09:33:09 +00002860 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002862 ts = rt->peer->tcp_ts;
James Morris9d729f72007-03-04 16:12:44 -08002863 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002864 }
2865 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002866
Linus Torvalds1da177e2005-04-16 15:20:36 -07002867 if (rt->fl.iif) {
2868#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002869 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002870
/*
 * Non-local multicast with forwarding enabled is described by
 * ipmr_get_route() instead of an RTA_IIF attribute.
 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002871 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002872 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2873 int err = ipmr_get_route(net, skb, r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002874 if (err <= 0) {
2875 if (!nowait) {
2876 if (err == 0)
2877 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002878 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002879 } else {
2880 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002881 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002882 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002883 }
2884 }
2885 } else
2886#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002887 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002888 }
2889
Thomas Grafe3703b32006-11-27 09:27:07 -08002890 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2891 expires, error) < 0)
2892 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002893
Thomas Grafbe403ea2006-08-17 18:15:17 -07002894 return nlmsg_end(skb, nlh);
2895
/* Undo the partially built message. */
2896nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002897 nlmsg_cancel(skb, nlh);
2898 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002899}
2900
/*
 * Netlink handler that resolves a single route and unicasts the answer
 * back to the requester.
 *
 * The request attributes RTA_SRC, RTA_DST, RTA_IIF and RTA_OIF are read
 * (missing ones default to 0).  With an input interface the lookup goes
 * through ip_route_input() on a dummy skb; otherwise ip_route_output_key()
 * is used.  The resulting route is attached to a freshly allocated skb,
 * described with rt_fill_info() as an RTM_NEWROUTE message and sent with
 * rtnl_unicast().
 *
 * Returns the rtnl_unicast() result on success or a negative errno; the
 * reply skb is freed on every error path after its allocation.
 */
Thomas Graf63f34442007-03-22 11:55:17 -07002901static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002902{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002903 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07002904 struct rtmsg *rtm;
2905 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002906 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002907 __be32 dst = 0;
2908 __be32 src = 0;
2909 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002910 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002911 struct sk_buff *skb;
2912
Thomas Grafd889ce32006-08-17 18:15:44 -07002913 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2914 if (err < 0)
2915 goto errout;
2916
2917 rtm = nlmsg_data(nlh);
2918
Linus Torvalds1da177e2005-04-16 15:20:36 -07002919 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002920 if (skb == NULL) {
2921 err = -ENOBUFS;
2922 goto errout;
2923 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002924
2925 /* Reserve room for dummy headers, this skb can pass
2926 through good chunk of routing engine.
2927 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002928 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002929 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002930
2931 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002932 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002933 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2934
Al Viro17fb2c62006-09-26 22:15:25 -07002935 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2936 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002937 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002938
/* Input interface given: simulate reception of a packet on that device. */
2939 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002940 struct net_device *dev;
2941
Denis V. Lunev19375042008-02-28 20:52:04 -08002942 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002943 if (dev == NULL) {
2944 err = -ENODEV;
2945 goto errout_free;
2946 }
2947
Linus Torvalds1da177e2005-04-16 15:20:36 -07002948 skb->protocol = htons(ETH_P_IP);
2949 skb->dev = dev;
2950 local_bh_disable();
2951 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2952 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002953
/* A route that carries an error is reported as that (negated) error. */
Eric Dumazet511c3f92009-06-02 05:14:27 +00002954 rt = skb_rtable(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002955 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002956 err = -rt->u.dst.error;
2957 } else {
/* No input interface: plain output route lookup. */
Thomas Grafd889ce32006-08-17 18:15:44 -07002958 struct flowi fl = {
2959 .nl_u = {
2960 .ip4_u = {
2961 .daddr = dst,
2962 .saddr = src,
2963 .tos = rtm->rtm_tos,
2964 },
2965 },
2966 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2967 };
Denis V. Lunev19375042008-02-28 20:52:04 -08002968 err = ip_route_output_key(net, &rt, &fl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002969 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002970
Linus Torvalds1da177e2005-04-16 15:20:36 -07002971 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002972 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002973
Eric Dumazetadf30902009-06-02 05:19:30 +00002974 skb_dst_set(skb, &rt->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002975 if (rtm->rtm_flags & RTM_F_NOTIFY)
2976 rt->rt_flags |= RTCF_NOTIFY;
2977
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002978 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08002979 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002980 if (err <= 0)
2981 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002982
Denis V. Lunev19375042008-02-28 20:52:04 -08002983 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002984errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002985 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002986
Thomas Grafd889ce32006-08-17 18:15:44 -07002987errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002988 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002989 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002990}
2991
2992int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2993{
2994 struct rtable *rt;
2995 int h, s_h;
2996 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08002997 struct net *net;
2998
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002999 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003000
3001 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08003002 if (s_h < 0)
3003 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003004 s_idx = idx = cb->args[1];
Eric Dumazeta6272662008-08-28 01:11:25 -07003005 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3006 if (!rt_hash_table[h].chain)
3007 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003008 rcu_read_lock_bh();
3009 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08003010 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09003011 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003012 continue;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003013 if (rt_is_expired(rt))
Eric Dumazet29e75252008-01-31 17:05:09 -08003014 continue;
Eric Dumazetadf30902009-06-02 05:19:30 +00003015 skb_dst_set(skb, dst_clone(&rt->u.dst));
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003016 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003017 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07003018 1, NLM_F_MULTI) <= 0) {
Eric Dumazetadf30902009-06-02 05:19:30 +00003019 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003020 rcu_read_unlock_bh();
3021 goto done;
3022 }
Eric Dumazetadf30902009-06-02 05:19:30 +00003023 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003024 }
3025 rcu_read_unlock_bh();
3026 }
3027
3028done:
3029 cb->args[0] = h;
3030 cb->args[1] = idx;
3031 return skb->len;
3032}
3033
/*
 * Multicast state of in_dev changed: flush the route cache of the network
 * namespace that owns in_dev's device, using a delay argument of 0.
 */
3034void ip_rt_multicast_event(struct in_device *in_dev)
3035{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07003036 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003037}
3038
3039#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003040static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003041 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003042 size_t *lenp, loff_t *ppos)
3043{
3044 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003045 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003046 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003047 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003048
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003049 memcpy(&ctl, __ctl, sizeof(ctl));
3050 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003051 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003052
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003053 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003054 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003055 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003056 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003057
3058 return -EINVAL;
3059}
3060
3061static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003062 void __user *oldval,
3063 size_t __user *oldlenp,
3064 void __user *newval,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -08003065 size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003066{
3067 int delay;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003068 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003069 if (newlen != sizeof(int))
3070 return -EINVAL;
3071 if (get_user(delay, (int __user *)newval))
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003072 return -EFAULT;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003073 net = (struct net *)table->extra1;
3074 rt_cache_flush(net, delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003075 return 0;
3076}
3077
Herbert Xuc6153b52008-08-15 13:44:31 -07003078static void rt_secret_reschedule(int old)
3079{
3080 struct net *net;
3081 int new = ip_rt_secret_interval;
3082 int diff = new - old;
3083
3084 if (!diff)
3085 return;
3086
3087 rtnl_lock();
3088 for_each_net(net) {
3089 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3090
3091 if (!new)
3092 continue;
3093
3094 if (deleted) {
3095 long time = net->ipv4.rt_secret_timer.expires - jiffies;
3096
3097 if (time <= 0 || (time += diff) <= 0)
3098 time = 0;
3099
3100 net->ipv4.rt_secret_timer.expires = time;
3101 } else
3102 net->ipv4.rt_secret_timer.expires = new;
3103
3104 net->ipv4.rt_secret_timer.expires += jiffies;
3105 add_timer(&net->ipv4.rt_secret_timer);
3106 }
3107 rtnl_unlock();
3108}
3109
3110static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
Herbert Xuc6153b52008-08-15 13:44:31 -07003111 void __user *buffer, size_t *lenp,
3112 loff_t *ppos)
3113{
3114 int old = ip_rt_secret_interval;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003115 int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
Herbert Xuc6153b52008-08-15 13:44:31 -07003116
3117 rt_secret_reschedule(old);
3118
3119 return ret;
3120}
3121
3122static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
Herbert Xuc6153b52008-08-15 13:44:31 -07003123 void __user *oldval,
3124 size_t __user *oldlenp,
3125 void __user *newval,
3126 size_t newlen)
3127{
3128 int old = ip_rt_secret_interval;
Alexey Dobriyanf221e722008-10-15 22:04:23 -07003129 int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
Herbert Xuc6153b52008-08-15 13:44:31 -07003130
3131 rt_secret_reschedule(old);
3132
3133 return ret;
3134}
3135
Al Viroeeb61f72008-07-27 08:59:33 +01003136static ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003137 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003138 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
3139 .procname = "gc_thresh",
3140 .data = &ipv4_dst_ops.gc_thresh,
3141 .maxlen = sizeof(int),
3142 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003143 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003144 },
3145 {
3146 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
3147 .procname = "max_size",
3148 .data = &ip_rt_max_size,
3149 .maxlen = sizeof(int),
3150 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003151 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003152 },
3153 {
3154 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003155
Linus Torvalds1da177e2005-04-16 15:20:36 -07003156 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
3157 .procname = "gc_min_interval",
3158 .data = &ip_rt_gc_min_interval,
3159 .maxlen = sizeof(int),
3160 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003161 .proc_handler = proc_dointvec_jiffies,
3162 .strategy = sysctl_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003163 },
3164 {
3165 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
3166 .procname = "gc_min_interval_ms",
3167 .data = &ip_rt_gc_min_interval,
3168 .maxlen = sizeof(int),
3169 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003170 .proc_handler = proc_dointvec_ms_jiffies,
3171 .strategy = sysctl_ms_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003172 },
3173 {
3174 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
3175 .procname = "gc_timeout",
3176 .data = &ip_rt_gc_timeout,
3177 .maxlen = sizeof(int),
3178 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003179 .proc_handler = proc_dointvec_jiffies,
3180 .strategy = sysctl_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003181 },
3182 {
3183 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
3184 .procname = "gc_interval",
3185 .data = &ip_rt_gc_interval,
3186 .maxlen = sizeof(int),
3187 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003188 .proc_handler = proc_dointvec_jiffies,
3189 .strategy = sysctl_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003190 },
3191 {
3192 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
3193 .procname = "redirect_load",
3194 .data = &ip_rt_redirect_load,
3195 .maxlen = sizeof(int),
3196 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003197 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003198 },
3199 {
3200 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
3201 .procname = "redirect_number",
3202 .data = &ip_rt_redirect_number,
3203 .maxlen = sizeof(int),
3204 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003205 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003206 },
3207 {
3208 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
3209 .procname = "redirect_silence",
3210 .data = &ip_rt_redirect_silence,
3211 .maxlen = sizeof(int),
3212 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003213 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003214 },
3215 {
3216 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
3217 .procname = "error_cost",
3218 .data = &ip_rt_error_cost,
3219 .maxlen = sizeof(int),
3220 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003221 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003222 },
3223 {
3224 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
3225 .procname = "error_burst",
3226 .data = &ip_rt_error_burst,
3227 .maxlen = sizeof(int),
3228 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003229 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003230 },
3231 {
3232 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
3233 .procname = "gc_elasticity",
3234 .data = &ip_rt_gc_elasticity,
3235 .maxlen = sizeof(int),
3236 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003237 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003238 },
3239 {
3240 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
3241 .procname = "mtu_expires",
3242 .data = &ip_rt_mtu_expires,
3243 .maxlen = sizeof(int),
3244 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003245 .proc_handler = proc_dointvec_jiffies,
3246 .strategy = sysctl_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003247 },
3248 {
3249 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
3250 .procname = "min_pmtu",
3251 .data = &ip_rt_min_pmtu,
3252 .maxlen = sizeof(int),
3253 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003254 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003255 },
3256 {
3257 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
3258 .procname = "min_adv_mss",
3259 .data = &ip_rt_min_advmss,
3260 .maxlen = sizeof(int),
3261 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003262 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003263 },
3264 {
3265 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3266 .procname = "secret_interval",
3267 .data = &ip_rt_secret_interval,
3268 .maxlen = sizeof(int),
3269 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003270 .proc_handler = ipv4_sysctl_rt_secret_interval,
3271 .strategy = ipv4_sysctl_rt_secret_interval_strategy,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003272 },
3273 { .ctl_name = 0 }
3274};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003275
Al Viro2f4520d2008-08-25 15:17:44 -07003276static struct ctl_table empty[1];
3277
3278static struct ctl_table ipv4_skeleton[] =
3279{
3280 { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
Hugh Dickinsd994af02008-08-27 02:35:18 -07003281 .mode = 0555, .child = ipv4_route_table},
Al Viro2f4520d2008-08-25 15:17:44 -07003282 { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
Hugh Dickinsd994af02008-08-27 02:35:18 -07003283 .mode = 0555, .child = empty},
Al Viro2f4520d2008-08-25 15:17:44 -07003284 { }
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003285};
3286
Al Viro2f4520d2008-08-25 15:17:44 -07003287static __net_initdata struct ctl_path ipv4_path[] = {
3288 { .procname = "net", .ctl_name = CTL_NET, },
3289 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3290 { },
3291};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003292
3293static struct ctl_table ipv4_route_flush_table[] = {
3294 {
3295 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3296 .procname = "flush",
3297 .maxlen = sizeof(int),
3298 .mode = 0200,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003299 .proc_handler = ipv4_sysctl_rtcache_flush,
3300 .strategy = ipv4_sysctl_rtcache_flush_strategy,
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003301 },
3302 { .ctl_name = 0 },
3303};
3304
Al Viro2f4520d2008-08-25 15:17:44 -07003305static __net_initdata struct ctl_path ipv4_route_path[] = {
3306 { .procname = "net", .ctl_name = CTL_NET, },
3307 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3308 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3309 { },
3310};
3311
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003312static __net_init int sysctl_route_net_init(struct net *net)
3313{
3314 struct ctl_table *tbl;
3315
3316 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003317 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003318 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3319 if (tbl == NULL)
3320 goto err_dup;
3321 }
3322 tbl[0].extra1 = net;
3323
3324 net->ipv4.route_hdr =
3325 register_net_sysctl_table(net, ipv4_route_path, tbl);
3326 if (net->ipv4.route_hdr == NULL)
3327 goto err_reg;
3328 return 0;
3329
3330err_reg:
3331 if (tbl != ipv4_route_flush_table)
3332 kfree(tbl);
3333err_dup:
3334 return -ENOMEM;
3335}
3336
3337static __net_exit void sysctl_route_net_exit(struct net *net)
3338{
3339 struct ctl_table *tbl;
3340
3341 tbl = net->ipv4.route_hdr->ctl_table_arg;
3342 unregister_net_sysctl_table(net->ipv4.route_hdr);
3343 BUG_ON(tbl == ipv4_route_flush_table);
3344 kfree(tbl);
3345}
3346
3347static __net_initdata struct pernet_operations sysctl_route_ops = {
3348 .init = sysctl_route_net_init,
3349 .exit = sysctl_route_net_exit,
3350};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003351#endif
3352
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003353
3354static __net_init int rt_secret_timer_init(struct net *net)
3355{
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003356 atomic_set(&net->ipv4.rt_genid,
3357 (int) ((num_physpages ^ (num_physpages>>8)) ^
3358 (jiffies ^ (jiffies >> 7))));
3359
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003360 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3361 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3362 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3363
Herbert Xuc6153b52008-08-15 13:44:31 -07003364 if (ip_rt_secret_interval) {
3365 net->ipv4.rt_secret_timer.expires =
3366 jiffies + net_random() % ip_rt_secret_interval +
3367 ip_rt_secret_interval;
3368 add_timer(&net->ipv4.rt_secret_timer);
3369 }
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003370 return 0;
3371}
3372
3373static __net_exit void rt_secret_timer_exit(struct net *net)
3374{
3375 del_timer_sync(&net->ipv4.rt_secret_timer);
3376}
3377
3378static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3379 .init = rt_secret_timer_init,
3380 .exit = rt_secret_timer_exit,
3381};
3382
3383
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-cpu accounting area; ip_rt_init() allocates 256 entries per cpu. */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3387
3388static __initdata unsigned long rhash_entries;
3389static int __init set_rhash_entries(char *str)
3390{
3391 if (!str)
3392 return 0;
3393 rhash_entries = simple_strtoul(str, &str, 0);
3394 return 1;
3395}
3396__setup("rhash_entries=", set_rhash_entries);
3397
3398int __init ip_rt_init(void)
3399{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003400 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003401
Linus Torvalds1da177e2005-04-16 15:20:36 -07003402#ifdef CONFIG_NET_CLS_ROUTE
Ingo Molnar0dcec8c2009-02-25 14:07:33 +01003403 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003404 if (!ip_rt_acct)
3405 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003406#endif
3407
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003408 ipv4_dst_ops.kmem_cachep =
3409 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003410 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003411
David S. Miller14e50e52007-05-24 18:17:54 -07003412 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3413
Eric Dumazet424c4b72005-07-05 14:58:19 -07003414 rt_hash_table = (struct rt_hash_bucket *)
3415 alloc_large_system_hash("IP route cache",
3416 sizeof(struct rt_hash_bucket),
3417 rhash_entries,
Jan Beulich44813742009-09-21 17:03:05 -07003418 (totalram_pages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003419 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003420 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003421 &rt_hash_log,
3422 &rt_hash_mask,
Anton Blanchardc9503e02009-04-27 05:42:24 -07003423 rhash_entries ? 0 : 512 * 1024);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003424 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3425 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003426
3427 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3428 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3429
Linus Torvalds1da177e2005-04-16 15:20:36 -07003430 devinet_init();
3431 ip_fib_init();
3432
Linus Torvalds1da177e2005-04-16 15:20:36 -07003433 /* All the timers, started at system startup tend
3434 to synchronize. Perturb it a bit.
3435 */
Eric Dumazet125bb8f2009-06-11 20:10:07 +00003436 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3437 expires_ljiffies = jiffies;
Eric Dumazet39c90ec2007-09-15 10:55:54 -07003438 schedule_delayed_work(&expires_work,
3439 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003440
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003441 if (register_pernet_subsys(&rt_secret_timer_ops))
3442 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003443
Denis V. Lunev73b38712008-02-28 20:51:18 -08003444 if (ip_rt_proc_init())
Pavel Emelyanov107f1632007-12-05 21:14:28 -08003445 printk(KERN_ERR "Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003446#ifdef CONFIG_XFRM
3447 xfrm_init();
Neil Hormana33bc5c2009-07-30 18:52:15 -07003448 xfrm4_init(ip_rt_max_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003449#endif
Thomas Graf63f34442007-03-22 11:55:17 -07003450 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3451
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003452#ifdef CONFIG_SYSCTL
3453 register_pernet_subsys(&sysctl_route_ops);
3454#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003455 return rc;
3456}
3457
#ifdef CONFIG_SYSCTL
/*
 * Register the static ipv4 sysctl skeleton (ipv4_skeleton) under
 * ipv4_path.
 *
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif
Al Viroeeb61f72008-07-27 08:59:33 +01003468
Linus Torvalds1da177e2005-04-16 15:20:36 -07003469EXPORT_SYMBOL(__ip_select_ident);
3470EXPORT_SYMBOL(ip_route_input);
3471EXPORT_SYMBOL(ip_route_output_key);