/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
        ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly = 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void ipv4_dst_destroy(struct dst_entry *dst);
static void ipv4_dst_ifdown(struct dst_entry *dst,
                            struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);


static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .local_out =            __ip_local_out,
        .entries =              ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(FILLER),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
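
/*
 * Note: callers index this table with the four IPv4 TOS bits shifted
 * right once -- rt_tos2priority() in <net/route.h> does
 * ip_tos2prio[IPTOS_TOS(tos) >> 1] -- so each even/odd pair maps a TOS
 * value with and without its lowest bit to the same class via
 * ECN_OR_COST().
 */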

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
        struct rtable *chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks.  The size of this table is a power of two and depends on
 * the number of CPUs (on lockdep we have a quite big spinlock_t, so keep
 * the size down there).
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ        256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ       4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ       2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ       1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ       512
# else
#  define RT_HASH_LOCK_SZ       256
# endif
#endif

static spinlock_t *rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
        int i;

        rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
                                GFP_KERNEL);
        if (!rt_hash_locks)
                panic("IP: failed to allocate rt_hash_locks\n");

        for (i = 0; i < RT_HASH_LOCK_SZ; i++)
                spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
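
/*
 * rt_hash_lock_addr() simply masks the bucket index, so many buckets
 * share each of the RT_HASH_LOCK_SZ striped locks; memory stays bounded
 * while unrelated writers still mostly avoid each other.
 */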

static struct rt_hash_bucket *rt_hash_table __read_mostly;
static unsigned rt_hash_mask __read_mostly;
static unsigned int rt_hash_log __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
        (__raw_get_cpu_var(rt_cache_stat).field++)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
                                   int genid)
{
        return jhash_3words((__force u32)(__be32)(daddr),
                            (__force u32)(__be32)(saddr),
                            idx, genid)
                & rt_hash_mask;
}
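
/*
 * The per-namespace generation id is part of the hash input, so bumping
 * it (see rt_cache_invalidate() below) effectively re-keys the whole
 * cache at once: stale entries stop matching lookups and are reaped
 * lazily via rt_is_expired().
 */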

static inline int rt_genid(struct net *net)
{
        return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
        struct seq_net_private p;
        int bucket;
        int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
        struct rt_cache_iter_state *st = seq->private;
        struct rtable *r = NULL;

        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
                if (!rt_hash_table[st->bucket].chain)
                        continue;
                rcu_read_lock_bh();
                r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
                while (r) {
                        if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
                            r->rt_genid == st->genid)
                                return r;
                        r = rcu_dereference_bh(r->u.dst.rt_next);
                }
                rcu_read_unlock_bh();
        }
        return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
                                          struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;

        r = r->u.dst.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
                do {
                        if (--st->bucket < 0)
                                return NULL;
                } while (!rt_hash_table[st->bucket].chain);
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
        return rcu_dereference_bh(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
                                        struct rtable *r)
{
        struct rt_cache_iter_state *st = seq->private;
        while ((r = __rt_cache_get_next(seq, r)) != NULL) {
                if (dev_net(r->u.dst.dev) != seq_file_net(seq))
                        continue;
                if (r->rt_genid == st->genid)
                        break;
        }
        return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
        struct rtable *r = rt_cache_get_first(seq);

        if (r)
                while (pos && (r = rt_cache_get_next(seq, r)))
                        --pos;
        return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct rt_cache_iter_state *st = seq->private;
        if (*pos)
                return rt_cache_get_idx(seq, *pos - 1);
        st->genid = rt_genid(seq_file_net(seq));
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct rtable *r;

        if (v == SEQ_START_TOKEN)
                r = rt_cache_get_first(seq);
        else
                r = rt_cache_get_next(seq, v);
        ++*pos;
        return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
        if (v && v != SEQ_START_TOKEN)
                rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        else {
                struct rtable *r = v;
                int len;

                seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
                           "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
                           r->u.dst.dev ? r->u.dst.dev->name : "*",
                           (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
                           r->rt_flags, atomic_read(&r->u.dst.__refcnt),
                           r->u.dst.__use, 0, (unsigned long)r->rt_src,
                           (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
                            (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
                           dst_metric(&r->u.dst, RTAX_WINDOW),
                           (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
                                 dst_metric(&r->u.dst, RTAX_RTTVAR)),
                           r->fl.fl4_tos,
                           r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
                           r->u.dst.hh ? (r->u.dst.hh->hh_output ==
                                          dev_queue_xmit) : 0,
                           r->rt_spec_dst, &len);

                seq_printf(seq, "%*s\n", 127 - len, "");
        }
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &rt_cache_seq_ops,
                        sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   atomic_read(&ipv4_dst_ops.entries),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_acct_proc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
                        &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_NET_CLS_ROUTE
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
        /* Kill broadcast/multicast entries very aggressively, if they
           collide in the hash table with more useful entries */
        return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
                rth->fl.iif && rth->u.dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
        return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
                rth->u.dst.expires;
}

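/*
 * May an entry be expired?  Never while it is still referenced; always
 * once it is hard-expired.  Otherwise an ordinary entry gets tmo1
 * jiffies of grace, a valuable one (see rt_valuable()) gets tmo2, and a
 * non-valuable broadcast/multicast collider (rt_fast_clean()) gets none.
 */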
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
        unsigned long age;
        int ret = 0;

        if (atomic_read(&rth->u.dst.__refcnt))
                goto out;

        ret = 1;
        if (rth->u.dst.expires &&
            time_after_eq(jiffies, rth->u.dst.expires))
                goto out;

        age = jiffies - rth->u.dst.lastuse;
        ret = 0;
        if ((age <= tmo1 && !rt_fast_clean(rth)) ||
            (age <= tmo2 && rt_valuable(rth)))
                goto out;
        ret = 1;
out:    return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
        u32 score = jiffies - rt->u.dst.lastuse;

        score = ~score & ~(3<<30);

        if (rt_valuable(rt))
                score |= (1<<31);

        if (!rt->fl.iif ||
            !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
                score |= (1<<30);

        return score;
}
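
/*
 * rt_intern_hash() uses this score to pick its eviction victim: among
 * the unreferenced entries of an overlong chain, the one with the
 * lowest score (oldest, least valuable, input path) is freed first.
 */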

static inline bool rt_caching(const struct net *net)
{
        return net->ipv4.current_rt_cache_rebuild_count <=
                net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
                                       const struct flowi *fl2)
{
        return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
                (fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
                (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
                (fl1->mark ^ fl2->mark) |
                (*(u16 *)&fl1->nl_u.ip4_u.tos ^
                 *(u16 *)&fl2->nl_u.ip4_u.tos) |
                (fl1->oif ^ fl2->oif) |
                (fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
        return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
        unsigned int i;
        struct rtable *rth, *next;
        struct rtable *tail;

        for (i = 0; i <= rt_hash_mask; i++) {
                if (process_context && need_resched())
                        cond_resched();
                rth = rt_hash_table[i].chain;
                if (!rth)
                        continue;

                spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
                {
                        struct rtable **prev, *p;

                        rth = rt_hash_table[i].chain;

                        /* defer releasing the head of the list after spin_unlock */
                        for (tail = rth; tail; tail = tail->u.dst.rt_next)
                                if (!rt_is_expired(tail))
                                        break;
                        if (rth != tail)
                                rt_hash_table[i].chain = tail;

                        /* call rt_free on entries after the tail requiring flush */
                        prev = &rt_hash_table[i].chain;
                        for (p = *prev; p; p = next) {
                                next = p->u.dst.rt_next;
                                if (!rt_is_expired(p)) {
                                        prev = &p->u.dst.rt_next;
                                } else {
                                        *prev = next;
                                        rt_free(p);
                                }
                        }
                }
#else
                rth = rt_hash_table[i].chain;
                rt_hash_table[i].chain = NULL;
                tail = NULL;
#endif
                spin_unlock_bh(rt_hash_lock_addr(i));

                for (; rth != tail; rth = next) {
                        next = rth->u.dst.rt_next;
                        rt_free(rth);
                }
        }
}

/*
 * While freeing expired entries, we compute the average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

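/*
 * Worked example with FRACT_BITS = 3: lengths are accumulated in units
 * of 1/8 entry (ONE == 8), so a measured avg of 20 (2.5 entries) and sd
 * of 6 (0.75 entries) give (avg + 4*sd) >> FRACT_BITS = 44 >> 3 = 5,
 * which the max() in rt_check_expire() then clamps up to
 * ip_rt_gc_elasticity (default 8).
 */
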
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
        const struct rtable *aux = head;

        while (aux != rth) {
                if (compare_hash_inputs(&aux->fl, &rth->fl))
                        return 0;
                aux = aux->u.dst.rt_next;
        }
        return ONE;
}

static void rt_check_expire(void)
{
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
        unsigned long samples = 0;
        unsigned long sum = 0, sum2 = 0;
        unsigned long delta;
        u64 mult;

        delta = jiffies - expires_ljiffies;
        expires_ljiffies = jiffies;
        mult = ((u64)delta) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;
                unsigned long length;

                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;

                if (need_resched())
                        cond_resched();

                samples++;

                if (*rthp == NULL)
                        continue;
                length = 0;
                spin_lock_bh(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
                        prefetch(rth->u.dst.rt_next);
                        if (rt_is_expired(rth)) {
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                continue;
                        }
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        /*
                                         * We only count entries on
                                         * a chain with equal hash inputs once
                                         * so that entries for different QOS
                                         * levels, and other non-hash input
                                         * attributes don't unfairly skew
                                         * the length computation
                                         */
                                        length += has_noalias(rt_hash_table[i].chain, rth);
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
                                goto nofree;

                        /* Cleanup aged off entries. */
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
                }
                spin_unlock_bh(rt_hash_lock_addr(i));
                sum += length;
                sum2 += length*length;
        }
        if (samples) {
                unsigned long avg = sum / samples;
                unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
                rt_chain_length_max = max_t(unsigned long,
                                            ip_rt_gc_elasticity,
                                            (avg + 4*sd) >> FRACT_BITS);
        }
        rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
        rt_check_expire();
        schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
        unsigned char shuffle;

        get_random_bytes(&shuffle, sizeof(shuffle));
        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
        rt_cache_invalidate(net);
        if (delay >= 0)
                rt_do_flush(!in_softirq());
}

/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(void)
{
        rt_do_flush(!in_softirq());
}

/*
 * We change rt_genid and let gc do the cleanup
 */
static void rt_secret_rebuild(unsigned long __net)
{
        struct net *net = (struct net *)__net;
        rt_cache_invalidate(net);
        mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_secret_rebuild_oneshot(struct net *net)
{
        del_timer_sync(&net->ipv4.rt_secret_timer);
        rt_cache_invalidate(net);
        if (ip_rt_secret_interval)
                mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_emergency_hash_rebuild(struct net *net)
{
        if (net_ratelimit()) {
                printk(KERN_WARNING "Route hash chain too long!\n");
                printk(KERN_WARNING "Adjust your secret_interval!\n");
        }

        rt_secret_rebuild_oneshot(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
        struct rtable *rth, **rthp;
        unsigned long now = jiffies;
        int goal;

        /*
         * Garbage collection is pretty expensive,
         * do not run it too frequently.
         */

        RT_CACHE_STAT_INC(gc_total);

        if (now - last_gc < ip_rt_gc_min_interval &&
            atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
        }

        /* Calculate the number of entries that we want to expire now. */
        goal = atomic_read(&ipv4_dst_ops.entries) -
                (ip_rt_gc_elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
                goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                if (goal > 0) {
                        equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
                        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
                }
        } else {
                /* We are in a dangerous area. Try to reduce the cache really
                 * aggressively.
                 */
                goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
                equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
        }

        if (now - last_gc >= ip_rt_gc_min_interval)
                last_gc = now;

        if (goal <= 0) {
                equilibrium += goal;
                goto work_done;
        }

        do {
                int i, k;

                for (i = rt_hash_mask, k = rover; i >= 0; i--) {
                        unsigned long tmo = expire;

                        k = (k + 1) & rt_hash_mask;
                        rthp = &rt_hash_table[k].chain;
                        spin_lock_bh(rt_hash_lock_addr(k));
                        while ((rth = *rthp) != NULL) {
                                if (!rt_is_expired(rth) &&
                                    !rt_may_expire(rth, tmo, expire)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
                                goal--;
                        }
                        spin_unlock_bh(rt_hash_lock_addr(k));
                        if (goal <= 0)
                                break;
                }
                rover = k;

                if (goal <= 0)
                        goto work_done;

                /* Goal is not achieved. We stop the process if:

                   - expire was reduced to zero; otherwise expire is halved.
                   - the table is not full.
                   - we are called from interrupt.
                   - the jiffies check is just a fallback/debug loop breaker.
                     We will not spin here for a long time in any case.
                 */

                RT_CACHE_STAT_INC(gc_goal_miss);

                if (expire == 0)
                        break;

                expire >>= 1;
#if RT_CACHE_DEBUG >= 2
                printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

                if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                        goto out;
        } while (!in_softirq() && time_before_eq(jiffies, now));

        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
                goto out;
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
        return 1;

work_done:
        expire += ip_rt_gc_min_interval;
        if (expire > ip_rt_gc_timeout ||
            atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
                        atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

/*
 * Returns the number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
        int length = 0;
        const struct rtable *rth = head;

        while (rth) {
                length += has_noalias(head, rth);
                rth = rth->u.dst.rt_next;
        }
        return length >> FRACT_BITS;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt,
                          struct rtable **rp, struct sk_buff *skb, int ifindex)
{
        struct rtable *rth, **rthp;
        unsigned long now;
        struct rtable *cand, **candp;
        u32 min_score;
        int chain_length;
        int attempts = !in_softirq();

restart:
        chain_length = 0;
        min_score = ~(u32)0;
        cand = NULL;
        candp = NULL;
        now = jiffies;

        if (!rt_caching(dev_net(rt->u.dst.dev))) {
                /*
                 * If we're not caching, just tell the caller we
                 * were successful and don't touch the route.  The
                 * caller holds the sole reference to the cache entry, and
                 * it will be released when the caller is done with it.
                 * If we drop it here, the callers have no way to resolve routes
                 * when we're not caching.  Instead, just point *rp at rt, so
                 * the caller gets a single use out of the route.
                 * Note that we do rt_free on this new route entry, so that
                 * once its refcount hits zero, we are still able to reap it
                 * (Thanks Alexey)
                 * Note also the rt_free uses call_rcu.  We don't actually
                 * need rcu protection here, this is just our path to get
                 * on the route gc list.
                 */

                if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                        int err = arp_bind_neighbour(&rt->u.dst);
                        if (err) {
                                if (net_ratelimit())
                                        printk(KERN_WARNING
                                            "Neighbour table failure & not caching routes.\n");
                                rt_drop(rt);
                                return err;
                        }
                }

                rt_free(rt);
                goto skip_hashing;
        }

        rthp = &rt_hash_table[hash].chain;

        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
                if (rt_is_expired(rth)) {
                        *rthp = rth->u.dst.rt_next;
                        rt_free(rth);
                        continue;
                }
                if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
                        /* Put it first */
                        *rthp = rth->u.dst.rt_next;
                        /*
                         * Since lookup is lockfree, the deletion
                         * must be visible to another weakly ordered CPU before
                         * the insertion at the start of the hash chain.
                         */
                        rcu_assign_pointer(rth->u.dst.rt_next,
                                           rt_hash_table[hash].chain);
                        /*
                         * Since lookup is lockfree, the update writes
                         * must be ordered for consistency on SMP.
                         */
                        rcu_assign_pointer(rt_hash_table[hash].chain, rth);

                        dst_use(&rth->u.dst, now);
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        rt_drop(rt);
                        if (rp)
                                *rp = rth;
                        else
                                skb_dst_set(skb, &rth->u.dst);
                        return 0;
                }

                if (!atomic_read(&rth->u.dst.__refcnt)) {
                        u32 score = rt_score(rth);

                        if (score <= min_score) {
                                cand = rth;
                                candp = rthp;
                                min_score = score;
                        }
                }

                chain_length++;

                rthp = &rth->u.dst.rt_next;
        }

        if (cand) {
                /* ip_rt_gc_elasticity used to be the average chain length;
                 * when it is exceeded gc becomes really aggressive.
                 *
                 * The second limit is less certain. At the moment it allows
                 * only 2 entries per bucket. We will see.
                 */
                if (chain_length > ip_rt_gc_elasticity) {
                        *candp = cand->u.dst.rt_next;
                        rt_free(cand);
                }
        } else {
                if (chain_length > rt_chain_length_max &&
                    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
                        struct net *net = dev_net(rt->u.dst.dev);
                        int num = ++net->ipv4.current_rt_cache_rebuild_count;
                        if (!rt_caching(net)) {
                                printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
                                        rt->u.dst.dev->name, num);
                        }
                        rt_emergency_hash_rebuild(net);
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
                                        ifindex, rt_genid(net));
                        goto restart;
                }
        }

        /* Try to bind route to arp only if it is output
           route or unicast forwarding path.
         */
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
                int err = arp_bind_neighbour(&rt->u.dst);
                if (err) {
                        spin_unlock_bh(rt_hash_lock_addr(hash));

                        if (err != -ENOBUFS) {
                                rt_drop(rt);
                                return err;
                        }

                        /* Neighbour tables are full and nothing
                           can be released. Try to shrink the route cache,
                           it is most likely it holds some neighbour records.
                         */
                        if (attempts-- > 0) {
                                int saved_elasticity = ip_rt_gc_elasticity;
                                int saved_int = ip_rt_gc_min_interval;
                                ip_rt_gc_elasticity = 1;
                                ip_rt_gc_min_interval = 0;
                                rt_garbage_collect(&ipv4_dst_ops);
                                ip_rt_gc_min_interval = saved_int;
                                ip_rt_gc_elasticity = saved_elasticity;
                                goto restart;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING "Neighbour table overflow.\n");
                        rt_drop(rt);
                        return -ENOBUFS;
                }
        }

        rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
        if (rt->u.dst.rt_next) {
                struct rtable *trt;
                printk(KERN_DEBUG "rt_cache @%02x: %pI4",
                       hash, &rt->rt_dst);
                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
                        printk(" . %pI4", &trt->rt_dst);
                printk("\n");
        }
#endif
        /*
         * Since lookup is lockfree, we must make sure
         * previous writes to rt are committed to memory
         * before making rt visible to other CPUS.
         */
        rcu_assign_pointer(rt_hash_table[hash].chain, rt);

        spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
        if (rp)
                *rp = rt;
        else
                skb_dst_set(skb, &rt->u.dst);
        return 0;
}

1293void rt_bind_peer(struct rtable *rt, int create)
1294{
1295 static DEFINE_SPINLOCK(rt_peer_lock);
1296 struct inet_peer *peer;
1297
1298 peer = inet_getpeer(rt->rt_dst, create);
1299
1300 spin_lock_bh(&rt_peer_lock);
1301 if (rt->peer == NULL) {
1302 rt->peer = peer;
1303 peer = NULL;
1304 }
1305 spin_unlock_bh(&rt_peer_lock);
1306 if (peer)
1307 inet_putpeer(peer);
1308}
1309
1310/*
1311 * Peer allocation may fail only in serious out-of-memory conditions. However
1312 * we still can generate some output.
1313 * Random ID selection looks a bit dangerous because we have no chances to
1314 * select ID being unique in a reasonable period of time.
1315 * But broken packet identifier may be better than no packet at all.
1316 */
1317static void ip_select_fb_ident(struct iphdr *iph)
1318{
1319 static DEFINE_SPINLOCK(ip_fb_id_lock);
1320 static u32 ip_fallback_id;
1321 u32 salt;
1322
1323 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001324 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001325 iph->id = htons(salt & 0xFFFF);
1326 ip_fallback_id = salt;
1327 spin_unlock_bh(&ip_fb_id_lock);
1328}
1329
1330void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1331{
1332 struct rtable *rt = (struct rtable *) dst;
1333
1334 if (rt) {
1335 if (rt->peer == NULL)
1336 rt_bind_peer(rt, 1);
1337
1338 /* If peer is attached to destination, it is never detached,
1339 so that we need not to grab a lock to dereference it.
1340 */
1341 if (rt->peer) {
1342 iph->id = htons(inet_getid(rt->peer, more));
1343 return;
1344 }
1345 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001346 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001347 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001348
1349 ip_select_fb_ident(iph);
1350}
1351
1352static void rt_del(unsigned hash, struct rtable *rt)
1353{
Eric Dumazet29e75252008-01-31 17:05:09 -08001354 struct rtable **rthp, *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001355
Eric Dumazet29e75252008-01-31 17:05:09 -08001356 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001357 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358 ip_rt_put(rt);
Eric Dumazet29e75252008-01-31 17:05:09 -08001359 while ((aux = *rthp) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001360 if (aux == rt || rt_is_expired(aux)) {
Eric Dumazet29e75252008-01-31 17:05:09 -08001361 *rthp = aux->u.dst.rt_next;
1362 rt_free(aux);
1363 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001364 }
Eric Dumazet29e75252008-01-31 17:05:09 -08001365 rthp = &aux->u.dst.rt_next;
1366 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001367 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368}
1369
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                    __be32 saddr, struct net_device *dev)
{
        int i, k;
        struct in_device *in_dev = in_dev_get(dev);
        struct rtable *rth, **rthp;
        __be32 skeys[2] = { saddr, 0 };
        int ikeys[2] = { dev->ifindex, 0 };
        struct netevent_redirect netevent;
        struct net *net;

        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!rt_caching(net))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        for (i = 0; i < 2; i++) {
                for (k = 0; k < 2; k++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
                                                rt_genid(net));

                        rthp = &rt_hash_table[hash].chain;

                        rcu_read_lock();
                        while ((rth = rcu_dereference(*rthp)) != NULL) {
                                struct rtable *rt;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0 ||
                                    rt_is_expired(rth) ||
                                    !net_eq(dev_net(rth->u.dst.dev), net)) {
                                        rthp = &rth->u.dst.rt_next;
                                        continue;
                                }

                                if (rth->rt_dst != daddr ||
                                    rth->rt_src != saddr ||
                                    rth->u.dst.error ||
                                    rth->rt_gateway != old_gw ||
                                    rth->u.dst.dev != dev)
                                        break;

                                dst_hold(&rth->u.dst);
                                rcu_read_unlock();

                                rt = dst_alloc(&ipv4_dst_ops);
                                if (rt == NULL) {
                                        ip_rt_put(rth);
                                        in_dev_put(in_dev);
                                        return;
                                }

                                /* Copy all the information. */
                                *rt = *rth;
                                rt->u.dst.__use = 1;
                                atomic_set(&rt->u.dst.__refcnt, 1);
                                rt->u.dst.child = NULL;
                                if (rt->u.dst.dev)
                                        dev_hold(rt->u.dst.dev);
                                if (rt->idev)
                                        in_dev_hold(rt->idev);
                                rt->u.dst.obsolete = -1;
                                rt->u.dst.lastuse = jiffies;
                                rt->u.dst.path = &rt->u.dst;
                                rt->u.dst.neighbour = NULL;
                                rt->u.dst.hh = NULL;
#ifdef CONFIG_XFRM
                                rt->u.dst.xfrm = NULL;
#endif
                                rt->rt_genid = rt_genid(net);
                                rt->rt_flags |= RTCF_REDIRECTED;

                                /* Gateway is different ... */
                                rt->rt_gateway = new_gw;

                                /* Redirect received -> path was valid */
                                dst_confirm(&rth->u.dst);

                                if (rt->peer)
                                        atomic_inc(&rt->peer->refcnt);

                                if (arp_bind_neighbour(&rt->u.dst) ||
                                    !(rt->u.dst.neighbour->nud_state & NUD_VALID)) {
                                        if (rt->u.dst.neighbour)
                                                neigh_event_send(rt->u.dst.neighbour, NULL);
                                        ip_rt_put(rth);
                                        rt_drop(rt);
                                        goto do_next;
                                }

                                netevent.old = &rth->u.dst;
                                netevent.new = &rt->u.dst;
                                call_netevent_notifiers(NETEVENT_REDIRECT,
                                                        &netevent);

                                rt_del(hash, rth);
                                if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
                        rcu_read_unlock();
                do_next:
                        ;
                }
        }
        in_dev_put(in_dev);
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
                       "  Advised path = %pI4 -> %pI4\n",
                       &old_gw, dev->name, &new_gw,
                       &saddr, &daddr);
#endif
        in_dev_put(in_dev);
}

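/*
 * dst negative_advice hook for IPv4 routes: called when an upper layer
 * reports trouble with a route, dropping obsolete, redirected or
 * expired entries from the cache.
 */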
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           (rt->u.dst.expires &&
                            time_after_eq(jiffies, rt->u.dst.expires))) {
                        unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
                                                rt->fl.oif,
                                                rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
                        printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
                               &rt->rt_dst, rt->fl.fl4_tos);
#endif
                        rt_del(hash, rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them altogether,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see any packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending
 *         redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

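/*
 * Worked example, assuming the default tunables defined earlier in
 * this file (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50):
 * the first redirect goes out immediately, the n-th one is delayed by
 * ip_rt_redirect_load << n, and after nine redirects the host is
 * considered deaf until it has been quiet for ip_rt_redirect_silence.
 */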
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->u.dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
                rt->u.dst.rate_tokens = 0;

        /* Too many ignored redirects; do not send anything;
         * set u.dst.rate_last to the last seen redirected packet.
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
                return;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (rt->u.dst.rate_tokens == 0 ||
            time_after(jiffies,
                       (rt->u.dst.rate_last +
                        (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
                               &rt->rt_src, rt->rt_iif,
                               &rt->rt_dst, &rt->rt_gateway);
#endif
        }
}

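/*
 * Input handler for routes that carry a pending error: translate the
 * dst error into an ICMP destination-unreachable code, rate-limited by
 * the ip_rt_error_cost/ip_rt_error_burst token bucket, and drop the
 * packet.
 */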
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        unsigned long now;
        int code;

        switch (rt->u.dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
                                IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        now = jiffies;
        rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
        if (rt->u.dst.rate_tokens > ip_rt_error_burst)
                rt->u.dst.rate_tokens = ip_rt_error_burst;
        rt->u.dst.rate_last = now;
        if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
                rt->u.dst.rate_tokens -= ip_rt_error_cost;
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
        }

out:    kfree_skb(skb);
        return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
                if (old_mtu > mtu_plateau[i])
                        return mtu_plateau[i];
        return 68;
}

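/*
 * Example: a probe that arrived with tot_len 1500 makes guess_mtu()
 * return the 1492 (PPPoE) plateau, while anything at or below 128
 * falls through to the 68 byte minimum.
 */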
unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
                                 unsigned short new_mtu,
                                 struct net_device *dev)
{
        int i, k;
        unsigned short old_mtu = ntohs(iph->tot_len);
        struct rtable *rth;
        int ikeys[2] = { dev->ifindex, 0 };
        __be32 skeys[2] = { iph->saddr, 0, };
        __be32 daddr = iph->daddr;
        unsigned short est_mtu = 0;

        for (k = 0; k < 2; k++) {
                for (i = 0; i < 2; i++) {
                        unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
                                                rt_genid(net));

                        rcu_read_lock();
                        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
                             rth = rcu_dereference(rth->u.dst.rt_next)) {
                                unsigned short mtu = new_mtu;

                                if (rth->fl.fl4_dst != daddr ||
                                    rth->fl.fl4_src != skeys[i] ||
                                    rth->rt_dst != daddr ||
                                    rth->rt_src != iph->saddr ||
                                    rth->fl.oif != ikeys[k] ||
                                    rth->fl.iif != 0 ||
                                    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
                                    !net_eq(dev_net(rth->u.dst.dev), net) ||
                                    rt_is_expired(rth))
                                        continue;

                                if (new_mtu < 68 || new_mtu >= old_mtu) {

                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
                                            old_mtu >= dst_mtu(&rth->u.dst) &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;

                                        mtu = guess_mtu(old_mtu);
                                }
                                if (mtu <= dst_mtu(&rth->u.dst)) {
                                        if (mtu < dst_mtu(&rth->u.dst)) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
                                                        rth->u.dst.metrics[RTAX_LOCK-1] |=
                                                                (1 << RTAX_MTU);
                                                }
                                                rth->u.dst.metrics[RTAX_MTU-1] = mtu;
                                                dst_set_expires(&rth->u.dst,
                                                                ip_rt_mtu_expires);
                                        }
                                        est_mtu = mtu;
                                }
                        }
                        rcu_read_unlock();
                }
        }
        return est_mtu ? : new_mtu;
}

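/*
 * dst update_pmtu hook: shrink the cached path MTU, clamping it to
 * ip_rt_min_pmtu (in which case the metric is also locked), and arm
 * the ip_rt_mtu_expires timeout.
 */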
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        if (dst_mtu(dst) > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
                        dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
                }
                dst->metrics[RTAX_MTU-1] = mtu;
                dst_set_expires(dst, ip_rt_mtu_expires);
                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
        }
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        if (rt_is_expired((struct rtable *)dst))
                return NULL;
        return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;
        struct inet_peer *peer = rt->peer;
        struct in_device *idev = rt->idev;

        if (peer) {
                rt->peer = NULL;
                inet_putpeer(peer);
        }

        if (idev) {
                rt->idev = NULL;
                in_dev_put(idev);
        }
}

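/*
 * When the device behind a cached route disappears, repoint rt->idev
 * at the namespace's loopback device so that the reference can still
 * be released safely later.
 */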
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
        struct rtable *rt = (struct rtable *) dst;
        struct in_device *idev = rt->idev;
        if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
                struct in_device *loopback_idev =
                        in_dev_get(dev_net(dev)->loopback_dev);
                if (loopback_idev) {
                        rt->idev = loopback_idev;
                        in_dev_put(idev);
                }
        }
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
               &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
               skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

1802void ip_rt_get_source(u8 *addr, struct rtable *rt)
1803{
Al Viroa61ced52006-09-26 21:27:54 -07001804 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001805 struct fib_result res;
1806
1807 if (rt->fl.iif == 0)
1808 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001809 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810 src = FIB_RES_PREFSRC(res);
1811 fib_res_put(&res);
1812 } else
1813 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1814 RT_SCOPE_UNIVERSE);
1815 memcpy(addr, &src, 4);
1816}
1817
#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->u.dst.tclassid & 0xFFFF))
                rt->u.dst.tclassid |= tag & 0xFFFF;
        if (!(rt->u.dst.tclassid & 0xFFFF0000))
                rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

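/*
 * Fill in the gateway, metrics and classid of a new cache entry from
 * the FIB lookup result, applying the global bounds: MTU capped at
 * IP_MAX_MTU, advmss kept between ip_rt_min_advmss and 65535 - 40,
 * and the default TTL used when no hoplimit metric is set.
 */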
static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
        struct fib_info *fi = res->fi;

        if (fi) {
                if (FIB_RES_GW(*res) &&
                    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
                        rt->rt_gateway = FIB_RES_GW(*res);
                memcpy(rt->u.dst.metrics, fi->fib_metrics,
                       sizeof(rt->u.dst.metrics));
                if (fi->fib_mtu == 0) {
                        rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
                        if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
                            rt->rt_gateway != rt->rt_dst &&
                            rt->u.dst.dev->mtu > 576)
                                rt->u.dst.metrics[RTAX_MTU-1] = 576;
                }
#ifdef CONFIG_NET_CLS_ROUTE
                rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
        } else
                rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;

        if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
        if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
                                                         ip_rt_min_advmss);
        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, fib_rules_tclass(res));
#endif
        set_class_tag(rt, itag);
#endif
        rt->rt_type = res->type;
}

static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                             u8 tos, struct net_device *dev, int our)
{
        unsigned hash;
        struct rtable *rth;
        __be32 spec_dst;
        struct in_device *in_dev = in_dev_get(dev);
        u32 itag = 0;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        } else if (fib_validate_source(saddr, 0, tos, 0,
                                       dev, &spec_dst, &itag, 0) < 0)
                goto e_inval;

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output = ip_rt_bug;
        rth->u.dst.obsolete = -1;

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags = DST_HOST;
        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
                rth->u.dst.flags |= DST_NOPOLICY;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst = daddr;
        rth->fl.fl4_tos = tos;
        rth->fl.mark = skb->mark;
        rth->fl.fl4_src = saddr;
        rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif =
        rth->fl.iif = dev->ifindex;
        rth->u.dst.dev = init_net.loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->idev = in_dev_get(rth->u.dst.dev);
        rth->fl.oif = 0;
        rth->rt_gateway = daddr;
        rth->rt_spec_dst = spec_dst;
        rth->rt_genid = rt_genid(dev_net(dev));
        rth->rt_flags = RTCF_MULTICAST;
        rth->rt_type = RTN_MULTICAST;
        if (our) {
                rth->u.dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->u.dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        in_dev_put(in_dev);
        hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
        return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);

e_nobufs:
        in_dev_put(in_dev);
        return -ENOBUFS;

e_inval:
        in_dev_put(in_dev);
        return -EINVAL;
}

static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 * RFC1812 recommendation: if the source is martian,
                 * the only hint is the MAC header.
                 */
                printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
                       &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        int i;
                        const unsigned char *p = skb_mac_header(skb);
                        printk(KERN_WARNING "ll header: ");
                        for (i = 0; i < dev->hard_header_len; i++, p++) {
                                printk("%02x", *p);
                                if (i < (dev->hard_header_len - 1))
                                        printk(":");
                        }
                        printk("\n");
                }
        }
#endif
}

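/*
 * Build one forwarding cache entry for an input route: validate the
 * source address, decide whether a redirect should be suggested
 * (RTCF_DOREDIRECT) and wire up ip_forward()/ip_output() as the
 * input/output handlers.
 */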
static int __mkroute_input(struct sk_buff *skb,
                           struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos,
                           struct rtable **result)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned flags = 0;
        __be32 spec_dst;
        u32 itag;

        /* get a working reference to the output device */
        out_dev = in_dev_get(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                if (net_ratelimit())
                        printk(KERN_CRIT "Bug in ip_route_input_slow(). "
                               "Please, report\n");
                return -EINVAL;
        }

        err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, &spec_dst, &itag, skb->mark);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                err = -EINVAL;
                goto cleanup;
        }

        if (err)
                flags |= RTCF_DIRECTSRC;

        if (out_dev == in_dev && err &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
                flags |= RTCF_DOREDIRECT;

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy arp. DNAT routes are always valid.
                 *
                 * The proxy arp feature has been extended to allow ARP
                 * replies back on the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags = DST_HOST;
        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
                rth->u.dst.flags |= DST_NOPOLICY;
        if (IN_DEV_CONF_GET(out_dev, NOXFRM))
                rth->u.dst.flags |= DST_NOXFRM;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst = daddr;
        rth->fl.fl4_tos = tos;
        rth->fl.mark = skb->mark;
        rth->fl.fl4_src = saddr;
        rth->rt_src = saddr;
        rth->rt_gateway = daddr;
        rth->rt_iif =
        rth->fl.iif = in_dev->dev->ifindex;
        rth->u.dst.dev = (out_dev)->dev;
        dev_hold(rth->u.dst.dev);
        rth->idev = in_dev_get(rth->u.dst.dev);
        rth->fl.oif = 0;
        rth->rt_spec_dst = spec_dst;

        rth->u.dst.obsolete = -1;
        rth->u.dst.input = ip_forward;
        rth->u.dst.output = ip_output;
        rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

        rt_set_nexthop(rth, res, itag);

        rth->rt_flags = flags;

        *result = rth;
        err = 0;
 cleanup:
        /* release the working reference to the output device */
        in_dev_put(out_dev);
        return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi *fl,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth = NULL;
        int err;
        unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
                fib_select_multipath(fl, res);
#endif

        /* create a routing cache entry */
        err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
        if (err)
                return err;

        /* put it into the cache */
        hash = rt_hash(daddr, saddr, fl->iif,
                       rt_genid(dev_net(rth->u.dst.dev)));
        return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}

/*
 * NOTE. We drop all packets that have a local source address, because
 * every properly looped-back packet must already have the correct
 * destination attached by the output routine.
 *
 * This approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                               u8 tos, struct net_device *dev)
{
        struct fib_result res;
        struct in_device *in_dev = in_dev_get(dev);
        struct flowi fl = { .nl_u = { .ip4_u =
                                      { .daddr = daddr,
                                        .saddr = saddr,
                                        .tos = tos,
                                        .scope = RT_SCOPE_UNIVERSE,
                                      } },
                            .mark = skb->mark,
                            .iif = dev->ifindex };
        unsigned flags = 0;
        u32 itag = 0;
        struct rtable *rth;
        unsigned hash;
        __be32 spec_dst;
        int err = -EINVAL;
        int free_res = 0;
        struct net *net = dev_net(dev);

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        /* Check for the most weird martians, which cannot be detected
         * by fib_lookup.
         */

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            ipv4_is_loopback(saddr))
                goto martian_source;

        if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only to limited broadcast;
         * I am not sure whether this should be fixed. Waiting for complaints :-)
         */
        if (ipv4_is_zeronet(saddr))
                goto martian_source;

        if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
            ipv4_is_loopback(daddr))
                goto martian_destination;

        /*
         * Now we are ready to route the packet.
         */
        if ((err = fib_lookup(net, &fl, &res)) != 0) {
                if (!IN_DEV_FORWARD(in_dev))
                        goto e_hostunreach;
                goto no_route;
        }
        free_res = 1;

        RT_CACHE_STAT_INC(in_slow_tot);

        if (res.type == RTN_BROADCAST)
                goto brd_input;

        if (res.type == RTN_LOCAL) {
                int result;
                result = fib_validate_source(saddr, daddr, tos,
                                             net->loopback_dev->ifindex,
                                             dev, &spec_dst, &itag, skb->mark);
                if (result < 0)
                        goto martian_source;
                if (result)
                        flags |= RTCF_DIRECTSRC;
                spec_dst = daddr;
                goto local_input;
        }

        if (!IN_DEV_FORWARD(in_dev))
                goto e_hostunreach;
        if (res.type != RTN_UNICAST)
                goto martian_destination;

        err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
        in_dev_put(in_dev);
        if (free_res)
                fib_res_put(&res);
out:    return err;

brd_input:
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (ipv4_is_zeronet(saddr))
                spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
        else {
                err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
                                          &itag, skb->mark);
                if (err < 0)
                        goto martian_source;
                if (err)
                        flags |= RTCF_DIRECTSRC;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);

local_input:
        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth)
                goto e_nobufs;

        rth->u.dst.output = ip_rt_bug;
        rth->u.dst.obsolete = -1;
        rth->rt_genid = rt_genid(net);

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags = DST_HOST;
        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
                rth->u.dst.flags |= DST_NOPOLICY;
        rth->fl.fl4_dst = daddr;
        rth->rt_dst = daddr;
        rth->fl.fl4_tos = tos;
        rth->fl.mark = skb->mark;
        rth->fl.fl4_src = saddr;
        rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
        rth->u.dst.tclassid = itag;
#endif
        rth->rt_iif =
        rth->fl.iif = dev->ifindex;
        rth->u.dst.dev = net->loopback_dev;
        dev_hold(rth->u.dst.dev);
        rth->idev = in_dev_get(rth->u.dst.dev);
        rth->rt_gateway = daddr;
        rth->rt_spec_dst = spec_dst;
        rth->u.dst.input = ip_local_deliver;
        rth->rt_flags = flags | RTCF_LOCAL;
        if (res.type == RTN_UNREACHABLE) {
                rth->u.dst.input = ip_error;
                rth->u.dst.error = -err;
                rth->rt_flags &= ~RTCF_LOCAL;
        }
        rth->rt_type = res.type;
        hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
        err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
        goto done;

no_route:
        RT_CACHE_STAT_INC(in_no_route);
        spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
        res.type = RTN_UNREACHABLE;
        if (err == -ESRCH)
                err = -ENETUNREACH;
        goto local_input;

        /*
         * Do not cache martian addresses: they should be logged (RFC1812).
         */
martian_destination:
        RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
                printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
                       &daddr, &saddr, dev->name);
#endif

e_hostunreach:
        err = -EHOSTUNREACH;
        goto done;

e_inval:
        err = -EINVAL;
        goto done;

e_nobufs:
        err = -ENOBUFS;
        goto done;

martian_source:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        goto e_inval;
}

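/*
 * Input routing fast path: probe the hash chain first and fall back to
 * ip_route_input_slow() on a miss.  The lookup below ORs the XORed key
 * fields together so that a non-matching entry is rejected with a
 * single conditional branch.
 */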
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                   u8 tos, struct net_device *dev)
{
        struct rtable *rth;
        unsigned hash;
        int iif = dev->ifindex;
        struct net *net;

        net = dev_net(dev);

        if (!rt_caching(net))
                goto skip_cache;

        tos &= IPTOS_RT_MASK;
        hash = rt_hash(daddr, saddr, iif, rt_genid(net));

        rcu_read_lock();
        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference(rth->u.dst.rt_next)) {
                if (((rth->fl.fl4_dst ^ daddr) |
                     (rth->fl.fl4_src ^ saddr) |
                     (rth->fl.iif ^ iif) |
                     rth->fl.oif |
                     (rth->fl.fl4_tos ^ tos)) == 0 &&
                    rth->fl.mark == skb->mark &&
                    net_eq(dev_net(rth->u.dst.dev), net) &&
                    !rt_is_expired(rth)) {
                        dst_use(&rth->u.dst, jiffies);
                        RT_CACHE_STAT_INC(in_hit);
                        rcu_read_unlock();
                        skb_dst_set(skb, &rth->u.dst);
                        return 0;
                }
                RT_CACHE_STAT_INC(in_hlist_search);
        }
        rcu_read_unlock();

skip_cache:
        /* Multicast recognition logic was moved from the route cache
           to here. The problem was that too many Ethernet cards have
           broken/missing hardware multicast filters :-( As a result,
           a host on a multicast network acquires a lot of useless route
           cache entries, e.g. for SDR messages from all over the world.
           Now we try to get rid of them. Provided the software IP
           multicast filter is organized reasonably (at least, hashed),
           this does not result in a slowdown compared with route cache
           reject entries.
           Note that multicast routers are not affected, because a
           route cache entry is created eventually.
         */
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev;

                rcu_read_lock();
                if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
                        int our = ip_check_mc(in_dev, daddr, saddr,
                                              ip_hdr(skb)->protocol);
                        if (our
#ifdef CONFIG_IP_MROUTE
                            ||
                            (!ipv4_is_local_multicast(daddr) &&
                             IN_DEV_MFORWARD(in_dev))
#endif
                            ) {
                                rcu_read_unlock();
                                return ip_route_input_mc(skb, daddr, saddr,
                                                         tos, dev, our);
                        }
                }
                rcu_read_unlock();
                return -EINVAL;
        }
        return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

static int __mkroute_output(struct rtable **result,
                            struct fib_result *res,
                            const struct flowi *fl,
                            const struct flowi *oldflp,
                            struct net_device *dev_out,
                            unsigned flags)
{
        struct rtable *rth;
        struct in_device *in_dev;
        u32 tos = RT_FL_TOS(oldflp);
        int err = 0;

        if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
                return -EINVAL;

        if (fl->fl4_dst == htonl(0xFFFFFFFF))
                res->type = RTN_BROADCAST;
        else if (ipv4_is_multicast(fl->fl4_dst))
                res->type = RTN_MULTICAST;
        else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
                return -EINVAL;

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        /* get a working reference to the inet device */
        in_dev = in_dev_get(dev_out);
        if (!in_dev)
                return -EINVAL;

        if (res->type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                if (res->fi) {
                        fib_info_put(res->fi);
                        res->fi = NULL;
                }
        } else if (res->type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST | RTCF_LOCAL;
                if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
                                 oldflp->proto))
                        flags &= ~RTCF_LOCAL;
                /* If no multicast route exists, use the default one,
                   but do not gateway in this case.
                   Yes, it is a hack.
                 */
                if (res->fi && res->prefixlen < 4) {
                        fib_info_put(res->fi);
                        res->fi = NULL;
                }
        }

        rth = dst_alloc(&ipv4_dst_ops);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        atomic_set(&rth->u.dst.__refcnt, 1);
        rth->u.dst.flags = DST_HOST;
        if (IN_DEV_CONF_GET(in_dev, NOXFRM))
                rth->u.dst.flags |= DST_NOXFRM;
        if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
                rth->u.dst.flags |= DST_NOPOLICY;

        rth->fl.fl4_dst = oldflp->fl4_dst;
        rth->fl.fl4_tos = tos;
        rth->fl.fl4_src = oldflp->fl4_src;
        rth->fl.oif = oldflp->oif;
        rth->fl.mark = oldflp->mark;
        rth->rt_dst = fl->fl4_dst;
        rth->rt_src = fl->fl4_src;
        rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
        /* get references to the devices that are to be held by the
           routing cache entry */
        rth->u.dst.dev = dev_out;
        dev_hold(dev_out);
        rth->idev = in_dev_get(dev_out);
        rth->rt_gateway = fl->fl4_dst;
        rth->rt_spec_dst = fl->fl4_src;

        rth->u.dst.output = ip_output;
        rth->u.dst.obsolete = -1;
        rth->rt_genid = rt_genid(dev_net(dev_out));

        RT_CACHE_STAT_INC(out_slow_tot);

        if (flags & RTCF_LOCAL) {
                rth->u.dst.input = ip_local_deliver;
                rth->rt_spec_dst = fl->fl4_dst;
        }
        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                rth->rt_spec_dst = fl->fl4_src;
                if (flags & RTCF_LOCAL &&
                    !(dev_out->flags & IFF_LOOPBACK)) {
                        rth->u.dst.output = ip_mc_output;
                        RT_CACHE_STAT_INC(out_slow_mc);
                }
#ifdef CONFIG_IP_MROUTE
                if (res->type == RTN_MULTICAST) {
                        if (IN_DEV_MFORWARD(in_dev) &&
                            !ipv4_is_local_multicast(oldflp->fl4_dst)) {
                                rth->u.dst.input = ip_mr_input;
                                rth->u.dst.output = ip_mc_output;
                        }
                }
#endif
        }

        rt_set_nexthop(rth, res, 0);

        rth->rt_flags = flags;

        *result = rth;
 cleanup:
        /* release the working reference to the inet device */
        in_dev_put(in_dev);

        return err;
}

static int ip_mkroute_output(struct rtable **rp,
                             struct fib_result *res,
                             const struct flowi *fl,
                             const struct flowi *oldflp,
                             struct net_device *dev_out,
                             unsigned flags)
{
        struct rtable *rth = NULL;
        int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
        unsigned hash;
        if (err == 0) {
                hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
                               rt_genid(dev_net(dev_out)));
                err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
        }

        return err;
}

/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
                                const struct flowi *oldflp)
{
        u32 tos = RT_FL_TOS(oldflp);
        struct flowi fl = { .nl_u = { .ip4_u =
                                      { .daddr = oldflp->fl4_dst,
                                        .saddr = oldflp->fl4_src,
                                        .tos = tos & IPTOS_RT_MASK,
                                        .scope = ((tos & RTO_ONLINK) ?
                                                  RT_SCOPE_LINK :
                                                  RT_SCOPE_UNIVERSE),
                                      } },
                            .mark = oldflp->mark,
                            .iif = net->loopback_dev->ifindex,
                            .oif = oldflp->oif };
        struct fib_result res;
        unsigned flags = 0;
        struct net_device *dev_out = NULL;
        int free_res = 0;
        int err;

        res.fi = NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
        res.r = NULL;
#endif

        if (oldflp->fl4_src) {
                err = -EINVAL;
                if (ipv4_is_multicast(oldflp->fl4_src) ||
                    ipv4_is_lbcast(oldflp->fl4_src) ||
                    ipv4_is_zeronet(oldflp->fl4_src))
                        goto out;

                /* I removed the check for oif == dev_out->oif here.
                   It was wrong for two reasons:
                   1. ip_dev_find(net, saddr) can return the wrong iface,
                      if saddr is assigned to multiple interfaces.
                   2. Moreover, we are allowed to send packets with the
                      saddr of another iface. --ANK
                 */

                if (oldflp->oif == 0 &&
                    (ipv4_is_multicast(oldflp->fl4_dst) ||
                     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        dev_out = ip_dev_find(net, oldflp->fl4_src);
                        if (dev_out == NULL)
                                goto out;

                        /* Special hack: the user can direct multicasts
                           and limited broadcast via the necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                           This hack is not just for fun, it allows
                           vic, vat and friends to work.
                           They bind the socket to loopback, set ttl to zero
                           and expect that it will work.
                           From the viewpoint of the routing cache they are
                           broken, because we are not allowed to build a
                           multicast path with a loopback source addr (look,
                           the routing cache cannot know that ttl is zero, so
                           the packet will not leave this host and the route
                           is valid).
                           Luckily, this hack is a good workaround.
                         */

                        fl.oif = dev_out->ifindex;
                        goto make_route;
                }

                if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        dev_out = ip_dev_find(net, oldflp->fl4_src);
                        if (dev_out == NULL)
                                goto out;
                        dev_put(dev_out);
                        dev_out = NULL;
                }
        }

        if (oldflp->oif) {
                dev_out = dev_get_by_index(net, oldflp->oif);
                err = -ENODEV;
                if (dev_out == NULL)
                        goto out;

                /* RACE: Check return value of inet_select_addr instead. */
                if (__in_dev_get_rtnl(dev_out) == NULL) {
                        dev_put(dev_out);
                        goto out;       /* Wrong error code */
                }

                if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
                    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
                        if (!fl.fl4_src)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!fl.fl4_src) {
                        if (ipv4_is_multicast(oldflp->fl4_dst))
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              fl.fl4_scope);
                        else if (!oldflp->fl4_dst)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
        }

        if (!fl.fl4_dst) {
                fl.fl4_dst = fl.fl4_src;
                if (!fl.fl4_dst)
                        fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
                if (dev_out)
                        dev_put(dev_out);
                dev_out = net->loopback_dev;
                dev_hold(dev_out);
                fl.oif = net->loopback_dev->ifindex;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        if (fib_lookup(net, &fl, &res)) {
                res.fi = NULL;
                if (oldflp->oif) {
                        /* Apparently, the routing tables are wrong.
                           Assume that the destination is on-link.

                           WHY? DW.
                           Because we are allowed to send to an iface
                           even if it has NO routes and NO assigned
                           addresses. When oif is specified, the routing
                           tables are looked up with only one purpose:
                           to catch whether the destination is gatewayed,
                           rather than direct. Moreover, if MSG_DONTROUTE
                           is set, we send the packet, ignoring both the
                           routing tables and ifaddr state. --ANK

                           We could make this even if oif is unknown,
                           likely IPv6, but we do not.
                         */

                        if (fl.fl4_src == 0)
                                fl.fl4_src = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        res.type = RTN_UNICAST;
                        goto make_route;
                }
                if (dev_out)
                        dev_put(dev_out);
                err = -ENETUNREACH;
                goto out;
        }
        free_res = 1;

        if (res.type == RTN_LOCAL) {
                if (!fl.fl4_src)
                        fl.fl4_src = fl.fl4_dst;
                if (dev_out)
                        dev_put(dev_out);
                dev_out = net->loopback_dev;
                dev_hold(dev_out);
                fl.oif = dev_out->ifindex;
                if (res.fi)
                        fib_info_put(res.fi);
                res.fi = NULL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && fl.oif == 0)
                fib_select_multipath(&fl, &res);
        else
#endif
        if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
                fib_select_default(net, &fl, &res);

        if (!fl.fl4_src)
                fl.fl4_src = FIB_RES_PREFSRC(res);

        if (dev_out)
                dev_put(dev_out);
        dev_out = FIB_RES_DEV(res);
        dev_hold(dev_out);
        fl.oif = dev_out->ifindex;

make_route:
        err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);

        if (free_res)
                fib_res_put(&res);
        if (dev_out)
                dev_put(dev_out);
out:    return err;
}

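/*
 * Output route lookup against the cache; on a miss, or when caching is
 * disabled, the full resolver above is invoked.
 */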
int __ip_route_output_key(struct net *net, struct rtable **rp,
                          const struct flowi *flp)
{
        unsigned hash;
        struct rtable *rth;

        if (!rt_caching(net))
                goto slow_output;

        hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

        rcu_read_lock_bh();
        for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
                if (rth->fl.fl4_dst == flp->fl4_dst &&
                    rth->fl.fl4_src == flp->fl4_src &&
                    rth->fl.iif == 0 &&
                    rth->fl.oif == flp->oif &&
                    rth->fl.mark == flp->mark &&
                    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
                      (IPTOS_RT_MASK | RTO_ONLINK)) &&
                    net_eq(dev_net(rth->u.dst.dev), net) &&
                    !rt_is_expired(rth)) {
                        dst_use(&rth->u.dst, jiffies);
                        RT_CACHE_STAT_INC(out_hit);
                        rcu_read_unlock_bh();
                        *rp = rth;
                        return 0;
                }
                RT_CACHE_STAT_INC(out_hlist_search);
        }
        rcu_read_unlock_bh();

slow_output:
        return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

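/*
 * The blackhole dst_ops below back the routes handed out when an XFRM
 * lookup would have to block (-EREMOTE, see ip_route_output_flow
 * below): they discard every packet but preserve enough route state
 * for the caller to hold on to.
 */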
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family		= AF_INET,
	.protocol	= cpu_to_be16(ETH_P_IP),
	.destroy	= ipv4_dst_destroy,
	.check		= ipv4_dst_check,
	.update_pmtu	= ipv4_rt_blackhole_update_pmtu,
	.entries	= ATOMIC_INIT(0),
};

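/*
 * Clone an existing rtable into a blackhole dst: copy the flow key,
 * metrics and addressing information, but wire ->input and ->output
 * to dst_discard.  On success *rp is replaced (the old route's
 * reference is dropped); on allocation failure -ENOMEM is returned.
 */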
static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}

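/*
 * Resolve an output route and, when the flow names a transport
 * protocol, run the result through xfrm so IPsec transformations are
 * applied.  A caller sketch (the identifiers dip, sip and sk are
 * placeholders, not taken from this file; the call itself matches
 * the signature below):
 *
 *	struct rtable *rt;
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip,
 *						 .saddr = sip } },
 *			    .proto = IPPROTO_UDP };
 *	err = ip_route_output_flow(net, &rt, &fl, sk, 1);
 *
 * A non-zero flags argument makes the xfrm lookup wait for larval
 * IPsec state to resolve instead of returning a blackhole route.
 */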
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

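/* Convenience wrapper: route lookup by key only, with no socket
 * context and non-blocking xfrm resolution. */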
int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}

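/*
 * Serialize one cached route into an RTM_NEWROUTE netlink message.
 * Returns the message length on success, -EMSGSIZE when the skb is
 * full, or 0 when the multicast routing code has queued the reply
 * for asynchronous completion.
 */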
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol	= RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

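/*
 * RTM_GETROUTE handler, i.e. the kernel side of "ip route get":
 * parse the request, resolve it through ip_route_input() (when an
 * input interface is given) or ip_route_output_key(), and unicast
 * the rt_fill_info() result back to the requester.
 */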
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass through a
	 * good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol = htons(ETH_P_IP);
		skb->dev = dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->u.dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

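/*
 * Dump the routing cache: walk every hash chain under
 * rcu_read_lock_bh() and emit one RTM_NEWROUTE message per entry,
 * resuming from cb->args[] across successive netlink reads.
 */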
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set(skb, dst_clone(&rt->u.dst));
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
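/*
 * Handler for net.ipv4.route.flush.  The value written is a flush
 * delay handed to rt_cache_flush(); the traditional invocation from
 * userspace is, for example:
 *
 *	echo -1 > /proc/sys/net/ipv4/route/flush
 *
 * which flushes the cache immediately.  Reading the file is not
 * supported.
 */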
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

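/*
 * ip_rt_secret_interval changed: adjust every namespace's cache
 * rebuild timer by the difference, restarting timers that had been
 * stopped while the old interval was zero.
 */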
static void rt_secret_reschedule(int old)
{
	struct net *net;
	int new = ip_rt_secret_interval;
	int diff = new - old;

	if (!diff)
		return;

	rtnl_lock();
	for_each_net(net) {
		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
		long time;

		if (!new)
			continue;

		if (deleted) {
			time = net->ipv4.rt_secret_timer.expires - jiffies;

			if (time <= 0 || (time += diff) <= 0)
				time = 0;
		} else
			time = new;

		mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
	}
	rtnl_unlock();
}

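/*
 * Handler for net.ipv4.route.secret_interval: record the old value,
 * let proc_dointvec_jiffies() update it, then reschedule the per-net
 * rebuild timers to match.
 */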
static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	int old = ip_rt_secret_interval;
	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);

	rt_secret_reschedule(old);

	return ret;
}

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipv4_sysctl_rt_secret_interval,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

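/*
 * Register net.ipv4.route.flush for a namespace.  init_net uses the
 * static table directly; other namespaces get a kmemdup()ed copy so
 * that ->extra1 can point at their own struct net.
 */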
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

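/*
 * Per-namespace setup: seed the cache generation id from the memory
 * size and jiffies, then arm the deferrable timer that periodically
 * invalidates the cache by bumping that id (rt_secret_rebuild).
 */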
static __net_init int rt_secret_timer_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid,
			(int) ((num_physpages ^ (num_physpages>>8)) ^
			(jiffies ^ (jiffies >> 7))));

	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
	net->ipv4.rt_secret_timer.data = (unsigned long)net;
	init_timer_deferrable(&net->ipv4.rt_secret_timer);

	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires =
			jiffies + net_random() % ip_rt_secret_interval +
			ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	return 0;
}

static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}

static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

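/*
 * "rhash_entries=N" on the kernel command line overrides the
 * automatically sized route cache hash table, e.g.
 *
 *	linux ... rhash_entries=262144
 *
 * (the value shown is only an illustration, not a recommendation).
 */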
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

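/*
 * Boot-time initialization: allocate the dst slab and the route
 * cache hash table, size the GC thresholds from the table size,
 * start the periodic expiry work, and register the netlink, proc
 * and sysctl entry points used above.
 */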
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers started at system startup tend to
	 * synchronize. Perturb them a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);