blob: 3b110a46362cd664ec178f6a13d0fab8fb99296e [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Joe Perchesafd465032012-03-12 07:03:32 +000065#define pr_fmt(fmt) "IPv4: " fmt
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070074#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070084#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070086#include <linux/inetdevice.h>
87#include <linux/igmp.h>
88#include <linux/pkt_sched.h>
89#include <linux/mroute.h>
90#include <linux/netfilter_ipv4.h>
91#include <linux/random.h>
92#include <linux/jhash.h>
93#include <linux/rcupdate.h>
94#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090095#include <linux/slab.h>
Stephen Rothwellb9eda062011-12-22 17:03:29 +110096#include <linux/prefetch.h>
Herbert Xu352e5122007-11-13 21:34:06 -080097#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020098#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070099#include <net/protocol.h>
100#include <net/ip.h>
101#include <net/route.h>
102#include <net/inetpeer.h>
103#include <net/sock.h>
104#include <net/ip_fib.h>
105#include <net/arp.h>
106#include <net/tcp.h>
107#include <net/icmp.h>
108#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700109#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700110#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700111#ifdef CONFIG_SYSCTL
112#include <linux/sysctl.h>
113#endif
David S. Miller6e5714e2011-08-03 20:50:44 -0700114#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115
David S. Miller68a5e3d2011-03-11 20:07:33 -0500116#define RT_FL_TOS(oldflp4) \
Julian Anastasovf61759e2011-12-02 11:39:42 +0000117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118
119#define IP_MAX_MTU 0xFFF0
120
121#define RT_GC_TIMEOUT (300*HZ)
122
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700124static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500125static int ip_rt_gc_interval __read_mostly = 60 * HZ;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700126static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
127static int ip_rt_redirect_number __read_mostly = 9;
128static int ip_rt_redirect_load __read_mostly = HZ / 50;
129static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
130static int ip_rt_error_cost __read_mostly = HZ;
131static int ip_rt_error_burst __read_mostly = 5 * HZ;
132static int ip_rt_gc_elasticity __read_mostly = 8;
133static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
134static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
135static int ip_rt_min_advmss __read_mostly = 256;
Neil Horman1080d702008-10-27 12:28:25 -0700136static int rt_chain_length_max __read_mostly = 20;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700137
Eric Dumazet9f28a2f2011-12-21 15:47:16 -0500138static struct delayed_work expires_work;
139static unsigned long expires_ljiffies;
140
Linus Torvalds1da177e2005-04-16 15:20:36 -0700141/*
142 * Interface to generic destination cache.
143 */
144
145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800146static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
Steffen Klassertebb762f2011-11-23 02:12:51 +0000147static unsigned int ipv4_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150static void ipv4_link_failure(struct sk_buff *skb);
151static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800152static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153
/* dst_ops.ifdown callback: intentionally a no-op for IPv4 route cache
 * entries — teardown work is handled elsewhere, so device-down events
 * need no per-dst action here.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158
/* dst_ops.cow_metrics callback: copy-on-write the dst metrics array.
 *
 * @dst: the dst_entry whose (currently read-only) metrics must be made
 *       writable
 * @old: the raw _metrics word observed by the caller
 *
 * Binds an inet_peer to the route if one is not already attached, copies
 * the old metrics into the peer's writable metrics array on first use,
 * and publishes it with cmpxchg().  Returns a pointer to writable
 * metrics, or NULL if another CPU won the race and installed a
 * read-only block (or no peer could be bound).
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		/* First writer initializes the peer's metrics from the
		 * previous (shared) array.
		 */
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: use whatever the winner installed,
			 * unless it is read-only (then report failure).
			 */
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			/* Won the race: metrics no longer come from the
			 * fib_info, so drop our reference to it.
			 */
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
193
David S. Millerd3aaeb32011-07-18 00:40:17 -0700194static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
195
/* Generic destination-cache operations for IPv4 routes.  Wires the
 * protocol-independent dst layer to the IPv4-specific implementations
 * declared above.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
212
213#define ECN_OR_COST(class) TC_PRIO_##class
214
/* Map the 4 TOS/DSCP-derived bits to a traffic-control priority class.
 * Odd indices (ECN bit set) map through ECN_OR_COST(), even indices to
 * the plain TC_PRIO_* class.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
233
234
235/*
236 * Route cache.
237 */
238
239/* The locking scheme is rather straight forward:
240 *
241 * 1) Read-Copy Update protects the buckets of the central route hash.
242 * 2) Only writers remove entries, and they hold the lock
243 * as they look at rtable reference counts.
244 * 3) Only readers acquire references to rtable entries,
245 * they do so with atomic increments and with the
246 * lock held.
247 */
248
249struct rt_hash_bucket {
Eric Dumazet1c317202010-10-25 21:02:07 +0000250 struct rtable __rcu *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700251};
Neil Horman1080d702008-10-27 12:28:25 -0700252
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700253#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700255/*
256 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
257 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700258 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700259 */
Ingo Molnar62051202006-07-03 00:24:59 -0700260#ifdef CONFIG_LOCKDEP
261# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700262#else
Ingo Molnar62051202006-07-03 00:24:59 -0700263# if NR_CPUS >= 32
264# define RT_HASH_LOCK_SZ 4096
265# elif NR_CPUS >= 16
266# define RT_HASH_LOCK_SZ 2048
267# elif NR_CPUS >= 8
268# define RT_HASH_LOCK_SZ 1024
269# elif NR_CPUS >= 4
270# define RT_HASH_LOCK_SZ 512
271# else
272# define RT_HASH_LOCK_SZ 256
273# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700274#endif
275
276static spinlock_t *rt_hash_locks;
277# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800278
279static __init void rt_hash_lock_init(void)
280{
281 int i;
282
283 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284 GFP_KERNEL);
285 if (!rt_hash_locks)
286 panic("IP: failed to allocate rt_hash_locks\n");
287
288 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289 spin_lock_init(&rt_hash_locks[i]);
290}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700291#else
292# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800293
294static inline void rt_hash_lock_init(void)
295{
296}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700297#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700298
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700299static struct rt_hash_bucket *rt_hash_table __read_mostly;
300static unsigned rt_hash_mask __read_mostly;
301static unsigned int rt_hash_log __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302
Eric Dumazet2f970d82006-01-17 02:54:36 -0800303static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Eric Dumazet27f39c73e2010-05-19 22:07:23 +0000304#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700305
/* Compute the route-cache hash bucket index for a (daddr, saddr, idx)
 * tuple.  @genid is mixed in so that bumping the generation id
 * effectively invalidates old hash placements.
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
313
/* Current route-cache generation id for @net; entries with a stale
 * rt_genid are treated as invalid by readers.
 */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
318
Linus Torvalds1da177e2005-04-16 15:20:36 -0700319#ifdef CONFIG_PROC_FS
320struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800321 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800323 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700324};
325
/* /proc iterator: find the first valid route-cache entry, scanning
 * buckets from the top of the table downwards.
 *
 * Returns with rcu_read_lock_bh() HELD when a matching entry is found
 * (the caller releases it via the seq_file stop/next path); returns
 * NULL with the lock released otherwise.  Only entries belonging to
 * this seq_file's netns and current generation are reported.
 */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Cheap unlocked peek: skip empty buckets entirely. */
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;	/* BH/RCU lock stays held */
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
346
/* /proc iterator helper: advance to the next chained entry, walking to
 * lower-numbered buckets when the current chain is exhausted.
 *
 * Called with rcu_read_lock_bh() held; keeps it held on a non-NULL
 * return, and releases/re-acquires it around the unlocked bucket scan.
 * Returns NULL (lock released) when the table is exhausted.  Does NOT
 * filter by netns/genid — rt_cache_get_next() does that.
 */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;	/* lock already dropped */
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
364
/* /proc iterator: next entry after @r that belongs to this seq_file's
 * netns and matches the generation captured at seq_start.  Locking
 * contract is inherited from __rt_cache_get_next().
 */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
377
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900378static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900380 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700381
382 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900383 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700384 --pos;
385 return pos ? NULL : r;
386}
387
/* seq_file .start: at position 0 emit the header token and latch the
 * current generation id; otherwise seek to entry *pos - 1 (the header
 * consumed position 0).
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
396
397static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
398{
Eric Dumazet29e75252008-01-31 17:05:09 -0800399 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700400
401 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900402 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700403 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900404 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700405 ++*pos;
406 return r;
407}
408
/* seq_file .stop: drop the BH/RCU lock that rt_cache_get_first() /
 * __rt_cache_get_next() left held for a real entry.  Nothing to do for
 * the header token or a NULL cursor.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
414
/* seq_file .show: print one /proc/net/rt_cache line (or the column
 * header for SEQ_START_TOKEN), padded to a fixed 127-column width.
 *
 * NOTE(review): relies on the '%n' conversion to learn the printed
 * length for padding — later kernels dropped %n support from
 * seq_printf; verify against the seq_file implementation in this tree.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		/* Sample the neighbour state under plain RCU. */
		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		/* Pad the record out to the fixed line width. */
		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
452
/* seq_file plumbing for /proc/net/rt_cache: iterator ops, the open
 * helper (per-netns private iterator state), and the file_operations
 * registered by ip_rt_do_proc_init().
 */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_cache_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
473
474
/* seq_file .start for /proc/net/stat/rt_cache: position 0 is the
 * header token; positions >= 1 map to possible CPUs (position = cpu+1),
 * skipping CPU ids that are not possible.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;	/* remember where to resume */
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
490
/* seq_file .next: advance to the next possible CPU's stats block,
 * updating *pos to cpu+1 so .start can resume after a buffer refill.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
504
/* seq_file .stop: nothing to release — the per-cpu stats are static. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

/* seq_file .show: one line of per-cpu route-cache statistics, preceded
 * by a header line for SEQ_START_TOKEN.  The first field is the global
 * dst entry count, not a per-cpu value.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
	);
	return 0;
}
543
/* seq_file plumbing for /proc/net/stat/rt_cache (per-cpu statistics):
 * iterator ops, open helper, and the registered file_operations.
 */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
564
Patrick McHardyc7066f72011-01-14 13:36:42 +0100565#ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct (CONFIG_IP_ROUTE_CLASSID): sum the 256-slot
 * per-cpu ip_rt_acct counters into a temporary array and emit it as a
 * single binary blob.  Returns 0 or -ENOMEM.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	/* Accumulate every possible CPU's counters slot by slot. */
	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800589
/* single_open wrapper and file_operations for /proc/net/rt_acct. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800602#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800603
/* Per-netns /proc setup: registers /proc/net/rt_cache,
 * /proc/net/stat/rt_cache and (with CONFIG_IP_ROUTE_CLASSID)
 * /proc/net/rt_acct.  Unwinds already-created entries on failure via
 * the goto chain; returns 0 or -ENOMEM.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800634
/* Per-netns /proc teardown: mirror of ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
643
/* Hook the per-netns /proc init/exit handlers into the pernet
 * machinery; ip_rt_proc_init() is called once at boot.
 */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
653
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800654#else
Denis V. Lunev73b38712008-02-28 20:51:18 -0800655static inline int ip_rt_proc_init(void)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800656{
657 return 0;
658}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900660
/* Queue @rt for RCU-deferred destruction. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

/* Drop the caller's reference and queue @rt for RCU-deferred
 * destruction.
 */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
671
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

/* An entry is "valuable" if it was redirected/notified or carries
 * pending PMTU expiry state on its peer — GC keeps such entries longer.
 */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
685
686static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
687{
688 unsigned long age;
689 int ret = 0;
690
Changli Gaod8d1f302010-06-10 23:31:35 -0700691 if (atomic_read(&rth->dst.__refcnt))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700692 goto out;
693
Changli Gaod8d1f302010-06-10 23:31:35 -0700694 age = jiffies - rth->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700695 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696 (age <= tmo2 && rt_valuable(rth)))
697 goto out;
698 ret = 1;
699out: return ret;
700}
701
702/* Bits of score are:
703 * 31: very valuable
704 * 30: not quite useless
705 * 29..0: usage counter
706 */
707static inline u32 rt_score(struct rtable *rt)
708{
Changli Gaod8d1f302010-06-10 23:31:35 -0700709 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700710
711 score = ~score & ~(3<<30);
712
713 if (rt_valuable(rt))
714 score |= (1<<31);
715
David S. Millerc7537962010-11-11 17:07:48 -0800716 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700717 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718 score |= (1<<30);
719
720 return score;
721}
722
/* Route caching stays enabled for @net until the number of emergency hash
 * rebuilds exceeds the sysctl_rt_cache_rebuild_count limit; past that the
 * cache is considered pathological and is turned off. */
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
728
/* Compare only the fields that feed the hash function: key dst, key src
 * and input interface.  Written as a single OR-of-XORs so the comparison
 * is branchless on this hot path; the __force casts only silence sparse
 * warnings about arithmetic on __be32. */
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}
736
/* Full cache-key comparison: hash inputs plus mark, tos and output
 * interface.  Same branchless OR-of-XORs idiom as compare_hash_inputs();
 * returns nonzero when every key field matches. */
static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
746
/* True when both entries' devices live in the same network namespace —
 * cache hits must never cross netns boundaries. */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
751
/* An entry whose generation id differs from its namespace's current
 * rt_genid was invalidated by rt_cache_invalidate() and must be freed
 * on sight rather than used. */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
756
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case (@process_context nonzero), we reschedule if needed.
 *
 * @net: only flush entries belonging to this namespace; NULL flushes all.
 *
 * Per bucket: entries to kill are unlinked onto a private list while the
 * bucket lock is held, then rt_free()d after the lock is dropped, keeping
 * the locked section short.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Cheap unlocked peek: skip empty buckets entirely. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				/* Unlink from the chain, push onto the
				 * private kill list. */
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free outside the bucket lock. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
807
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This to have an estimation of rt_chain_length_max
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
818
Eric Dumazet98376382010-03-08 03:20:00 +0000819/*
820 * Given a hash chain and an item in this hash chain,
821 * find if a previous entry has the same hash_inputs
822 * (but differs on tos, mark or oif)
823 * Returns 0 if an alias is found.
824 * Returns ONE if rth has no alias before itself.
825 */
826static int has_noalias(const struct rtable *head, const struct rtable *rth)
827{
828 const struct rtable *aux = head;
829
830 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800831 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000832 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000833 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000834 }
835 return ONE;
836}
837
/*
 * Incremental garbage collection of the route cache, run periodically from
 * rt_worker_func().  Scans a number of buckets proportional to the time
 * elapsed since the last pass (so the whole table is covered roughly once
 * per ip_rt_gc_timeout), frees expired/aged entries, and updates the
 * rt_chain_length_max estimate (avg + 4*sd of distinct-input chain length,
 * in FRACT_BITS fixed point).  @rover remembers where the scan stopped.
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/* Buckets to visit this pass: (elapsed / gc_timeout) fraction of
	 * the table, capped at one full sweep. */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		/* Unlocked peek: skip empty buckets without locking. */
		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Keep the entry; halve the timeout so
					 * deeper chain positions expire more
					 * eagerly. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* Recompute the chain-length alarm threshold from the
		 * observed mean and standard deviation. */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
918
/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table, then re-arm
 * ourselves so the scan repeats every ip_rt_gc_interval jiffies.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
928
/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 *
 * Bumping the namespace's generation id makes every cached entry fail the
 * rt_is_expired() check; entries are then lazily reaped rather than walked
 * immediately.  The inetpeer tree is invalidated alongside.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	inetpeer_invalidate_tree(AF_INET);
}
943
/*
 * delay < 0 : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 *
 * The flush path may sleep only when not called from softirq context,
 * hence the !in_softirq() process_context flag.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}
954
/* Flush previously-invalidated entries from the cache (a full table walk
 * via rt_do_flush) without bumping the generation id again. */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}
960
/* Emergency response to a pathologically long hash chain (possible hash
 * attack): warn (ratelimited) and invalidate the whole cache for @net.
 * The caller accounts the rebuild against the rt_caching() limit. */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		pr_warn("Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
967
/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

/*
 * dst_ops gc callback.  Returns 0 on success (or when GC was skipped),
 * 1 on unrecoverable cache overflow.  Uses function-static state
 * (expire/last_gc/rover/equilibrium) to adapt GC pressure between calls;
 * callers are serialized by the dst subsystem — NOTE(review): that
 * serialization is assumed here, the statics are not otherwise protected.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* One pass over the whole table starting after where the
		 * previous invocation stopped (rover). */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Survivor: halve the timeout for
					 * entries deeper in this chain. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		pr_warn("dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax expiration strength for the next invocation. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1100
Eric Dumazet98376382010-03-08 03:20:00 +00001101/*
1102 * Returns number of entries in a hash chain that have different hash_inputs
1103 */
1104static int slow_chain_length(const struct rtable *head)
1105{
1106 int length = 0;
1107 const struct rtable *rth = head;
1108
1109 while (rth) {
1110 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001111 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001112 }
1113 return length >> FRACT_BITS;
1114}
1115
/*
 * dst_ops neigh_lookup callback: find (or create) the ARP neighbour entry
 * for this route.
 *
 * Key selection: loopback and point-to-point devices use 0.0.0.0 (a single
 * shared neighbour per device); otherwise a set gateway overrides @daddr.
 * Falls back to neigh_create() when no entry exists in the ARP table.
 * Returns a referenced neighbour or an ERR_PTR from neigh_create().
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;

	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;
	else if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
1136
/* Resolve and attach the neighbour (ARP) entry for @rt's next hop.
 * Returns 0 on success or the negative error from ipv4_neigh_lookup()
 * (e.g. -ENOBUFS when the neighbour table is full). */
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
1146
/*
 * Insert @rt into the route cache bucket @hash, or return an existing
 * equivalent entry.
 *
 * Returns the cached rtable (which may be an existing entry, in which case
 * @rt is dropped) or an ERR_PTR on neighbour-binding failure.  If @skb is
 * non-NULL its dst is set to the returned route.  When caching is disabled
 * for the namespace, @rt is marked DST_NOCACHE and returned uncached.
 *
 * Locking: the bucket spinlock serializes writers; readers are lock-free
 * under RCU, which dictates the rcu_assign_pointer() publish ordering in
 * the move-to-front and insert paths below.
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable *rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long now;
	u32 min_score;
	int chain_length;
	/* Allow one GC-and-retry on neighbour table overflow, but only
	 * when we may be called from process context. */
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route. The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching. Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					pr_warn("Neighbour table failure & not caching routes\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			/* Reap stale-generation entries as we walk. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			/* Track the lowest-scoring unreferenced entry as
			 * the eviction candidate. */
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		/* No evictable entry and the chain is suspiciously long:
		 * possibly under hash attack — trigger an emergency rebuild
		 * and retry with the new generation's hash. */
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				pr_warn("Neighbour table overflow\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1325
/* Global generation counter for cached inetpeer state on routes; a route
 * whose rt_peer_genid no longer matches must revalidate its peer data. */
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

/* Snapshot of the current peer generation. */
static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1332
/*
 * Attach the long-lived inet_peer entry for @daddr to @rt.
 * @create: passed through to inet_getpeer_v4 — nonzero allocates a peer
 * if none exists.
 *
 * The cmpxchg publishes the peer atomically; if another CPU won the race,
 * drop our extra reference.  On successful attach (or lookup failure),
 * stamp the route with the current peer generation.
 */
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
1344
/*
 * Peer allocation may fail only in serious out-of-memory conditions. However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* Derive the next IP ID from a keyed hash of the previous one and
	 * the destination; the spinlock serializes the global state. */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1364
/*
 * Choose the IP header identification field for an outgoing packet.
 * Prefers the per-destination counter kept in the route's inet_peer
 * (binding one on demand); falls back to the global keyed-hash generator
 * when there is no usable peer.  @more is forwarded to inet_getid().
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else if (!rt)
		/* Called without a route — debug aid to find the caller. */
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387
/*
 * Remove @rt from hash bucket @hash and drop the caller's reference.
 * While holding the bucket lock we also opportunistically reap any
 * stale-generation entries encountered on the chain.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1407
/* Switch @dst over to the gateway learned from an ICMP redirect
 * (peer->redirect_learned.a4).  A neighbour entry for the new gateway
 * is looked up first; if that fails, the original gateway is restored
 * and nothing changes.  On success the dst's neighbour is swapped (the
 * old one released) and, once the new neighbour is valid, the route is
 * marked RTCF_REDIRECTED and a netevent is raised.
 */
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		/* Could not get a neighbour for the new gateway: back out. */
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		/* Start resolution of the new gateway's address. */
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
1433
/* called in rcu_read_lock() section */
/* Process a received ICMP redirect telling us to reach @daddr via
 * @new_gw instead of @old_gw, received on @dev from @saddr.  After
 * sanity-checking the advised gateway, every matching cached output
 * route is updated: the new gateway is remembered in the route's
 * inet_peer and applied via check_peer_redir().
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };		/* exact source, then wildcard */
	int ikeys[2] = { dev->ifindex, 0 };	/* exact oif, then wildcard */
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	/* Reject no-op, administratively disabled, or clearly bogus
	 * (multicast/broadcast/zeronet) gateways.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	/* Try all four (source, oif) combinations the cache may be keyed by. */
	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				/* Skip entries that do not match this flow,
				 * are stale, or already use another gateway.
				 */
				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		pr_info("Redirect from %pI4 on %s about %pI4 ignored\n"
			"  Advised path = %pI4 -> %pI4\n",
			&old_gw, dev->name, &new_gw,
			&saddr, &daddr);
#endif
	;
}
1514
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001515static bool peer_pmtu_expired(struct inet_peer *peer)
1516{
1517 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1518
1519 return orig &&
1520 time_after_eq(jiffies, orig) &&
1521 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1522}
1523
1524static bool peer_pmtu_cleaned(struct inet_peer *peer)
1525{
1526 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1527
1528 return orig &&
1529 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1530}
1531
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1533{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001534 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535 struct dst_entry *ret = dst;
1536
1537 if (rt) {
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001538 if (dst->obsolete > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001539 ip_rt_put(rt);
1540 ret = NULL;
David S. Miller2c8cec52011-02-09 20:42:07 -08001541 } else if (rt->rt_flags & RTCF_REDIRECTED) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08001542 unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1543 rt->rt_oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001544 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545 rt_del(hash, rt);
1546 ret = NULL;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001547 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1548 dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 }
1550 }
1551 return ret;
1552}
1553
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start sending
 *	   redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
1569
/* Decide whether the sender of @skb should be told about a better
 * gateway and, if the per-destination limits allow, send the ICMP
 * redirect.  Backoff state (rate_tokens/rate_last) lives in the route's
 * inet_peer; without a peer the redirect is sent unconditionally.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	/* Sample the flag while in_dev is still protected by RCU. */
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		/* No peer to rate-limit against: just send. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			pr_warn("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
				&ip_hdr(skb)->saddr, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1628
/* Input handler installed on routes whose dst.error is set: translate
 * the error into an ICMP destination-unreachable code and send it,
 * subject to a token-bucket rate limit kept in the route's inet_peer.
 * Always consumes the skb and returns 0.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* No ICMP reply for these errors; just drop. */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		/* Token bucket: tokens accrue with elapsed jiffies, are
		 * capped at ip_rt_error_burst, and each ICMP sent costs
		 * ip_rt_error_cost tokens.
		 */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001676
/* Common path-MTU plateaus (RFC 1191), in decreasing order; used by
 * guess_mtu() when an ICMP frag-needed message carries no usable MTU.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1684
Stephen Hemminger5969f712008-04-10 01:52:09 -07001685static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001686{
1687 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001688
Linus Torvalds1da177e2005-04-16 15:20:36 -07001689 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1690 if (old_mtu > mtu_plateau[i])
1691 return mtu_plateau[i];
1692 return 68;
1693}
1694
/* Handle an ICMP fragmentation-needed message concerning iph->daddr.
 * Derive a plausible path MTU from @new_mtu (or guess one from the
 * quoted header's tot_len when the message carries none), record it in
 * the destination's inet_peer with an expiry, and return the MTU that
 * was recorded -- or @new_mtu unchanged when nothing was recorded.
 */
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		/* Record only when no PMTU is pending or ours is smaller. */
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			/* pmtu_expires == 0 means "none"; never store 0. */
			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
1737
/* Synchronise dst's MTU metric with the peer's learned PMTU.  While the
 * learned value is still valid (not expired) and smaller than the
 * current dst MTU, install it, saving the original metric in pmtu_orig
 * first.  Once it has expired, exactly one caller wins the cmpxchg and
 * restores pmtu_orig.
 */
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
1754
/* dst_ops->update_pmtu callback: record @mtu (clamped up to
 * ip_rt_min_pmtu) in the route's inet_peer with a fresh expiry, bump
 * the peer generation counters, and apply the result to the dst
 * metrics via check_peer_pmtu().
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		/* Record only when no PMTU is pending or ours is smaller. */
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			/* pmtu_expires == 0 means "none"; never store 0. */
			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
1785
David S. Millerf39925d2011-02-09 22:00:16 -08001786
/* Bring a cached route up to date with its inet_peer when the global
 * peer generation counter has moved since the route last checked:
 * re-apply the learned PMTU and any learned redirect gateway.
 */
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		/* Opportunistic bind only (create == 0). */
		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
1807
1808static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1809{
1810 struct rtable *rt = (struct rtable *) dst;
1811
1812 if (rt_is_expired(rt))
1813 return NULL;
David S. Millerde398fb2011-12-05 13:21:42 -05001814 ipv4_validate_peer(rt);
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001815 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816}
1817
1818static void ipv4_dst_destroy(struct dst_entry *dst)
1819{
1820 struct rtable *rt = (struct rtable *) dst;
1821 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822
David S. Miller62fa8a82011-01-26 20:51:05 -08001823 if (rt->fi) {
1824 fib_info_put(rt->fi);
1825 rt->fi = NULL;
1826 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001827 if (peer) {
1828 rt->peer = NULL;
1829 inet_putpeer(peer);
1830 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001831}
1832
Linus Torvalds1da177e2005-04-16 15:20:36 -07001833
1834static void ipv4_link_failure(struct sk_buff *skb)
1835{
1836 struct rtable *rt;
1837
1838 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1839
Eric Dumazet511c3f92009-06-02 05:14:27 +00001840 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001841 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1842 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843}
1844
1845static int ip_rt_bug(struct sk_buff *skb)
1846{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001847 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1848 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001849 skb->dev ? skb->dev->name : "?");
1850 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001851 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001852 return 0;
1853}
1854
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1863
/* Copy into @addr (possibly unaligned, hence the memcpy) the source
 * address for @rt: the packet's own saddr for output routes, or the
 * preferred source found by a reverse fib lookup for input routes.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* Rebuild the flow key from the packet for the lookup. */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			/* No fib entry: pick any suitable address on the
			 * output device.
			 */
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
1895
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill in the still-unset halves of dst.tclassid from @tag: the low and
 * high 16 bits are taken independently, each only if currently zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1905
David S. Miller0dbaee32010-12-13 12:52:14 -08001906static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1907{
1908 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1909
1910 if (advmss == 0) {
1911 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1912 ip_rt_min_advmss);
1913 if (advmss > 65535 - 40)
1914 advmss = 65535 - 40;
1915 }
1916 return advmss;
1917}
1918
Steffen Klassertebb762f2011-11-23 02:12:51 +00001919static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001920{
Steffen Klassert261663b2011-11-23 02:14:50 +00001921 const struct rtable *rt = (const struct rtable *) dst;
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001922 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1923
Steffen Klassert261663b2011-11-23 02:14:50 +00001924 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001925 return mtu;
1926
1927 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001928
1929 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001930
1931 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1932 mtu = 576;
1933 }
1934
1935 if (mtu > IP_MAX_MTU)
1936 mtu = IP_MAX_MTU;
1937
1938 return mtu;
1939}
1940
/* Attach metrics to a newly built route.  When an inet_peer exists (or
 * must be created because the flow pre-COWs metrics), the peer's metric
 * array is used -- seeded from fi->fib_metrics on first use -- and any
 * learned PMTU/redirect state is applied immediately.  Otherwise the
 * fib_info's metrics are shared read-only, taking a reference unless
 * they are the default all-zero array.
 */
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		/* First user of this peer's metrics seeds them from fib. */
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);

		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
1976
/* Finish initialising a route from a fib lookup result: next-hop
 * gateway, metrics, classid tags, and clamping of the MTU and ADVMSS
 * metrics to their protocol limits.
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	/* Keep the metrics within IPv4 limits. */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
2005
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002006static struct rtable *rt_dst_alloc(struct net_device *dev,
2007 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08002008{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002009 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2010 DST_HOST |
2011 (nopolicy ? DST_NOPOLICY : 0) |
2012 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08002013}
2014
/* called in rcu_read_lock() section */
/* Build, initialise, and hash a cache entry for a multicast packet
 * received on @dev.  @our indicates the group is joined locally, so the
 * packet is also delivered to the local stack.  Returns 0 on success or
 * a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* Multicast, broadcast, and loopback sources are never valid. */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* A zero source is acceptable only for link-local groups. */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		/* Group joined locally: deliver up the stack too. */
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
2094
2095
2096static void ip_handle_martian_source(struct net_device *dev,
2097 struct in_device *in_dev,
2098 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07002099 __be32 daddr,
2100 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002101{
2102 RT_CACHE_STAT_INC(in_martian_src);
2103#ifdef CONFIG_IP_ROUTE_VERBOSE
2104 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2105 /*
2106 * RFC1812 recommendation, if source is martian,
2107 * the only hint is MAC header.
2108 */
Joe Perches058bd4d2012-03-11 18:36:11 +00002109 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002110 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002111 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Joe Perches058bd4d2012-03-11 18:36:11 +00002112 print_hex_dump(KERN_WARNING, "ll header: ",
2113 DUMP_PREFIX_OFFSET, 16, 1,
2114 skb_mac_header(skb),
2115 dev->hard_header_len, true);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116 }
2117 }
2118#endif
2119}
2120
Eric Dumazet47360222010-06-03 04:13:21 +00002121/* called in rcu_read_lock() section */
/*
 * Build a route-cache entry for a packet that will be forwarded
 * (RTN_UNICAST result).  Validates the source address, computes the
 * RTCF_* flags, allocates and fills the rtable, and returns it via
 * @result.  Returns 0 on success or a negative errno; the entry is
 * NOT yet inserted into the hash — the caller does that.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		/* FIB handed us a nexthop device with no IPv4 config;
		 * should not happen.
		 */
		if (net_ratelimit())
			pr_crit("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	/* Reverse-path check: err < 0 means a martian source;
	 * err > 0 means the source is valid but reached us via an
	 * unexpected interface (RTCF_DIRECTSRC below).
	 */
	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	/* Packet would leave via the interface it arrived on: candidate
	 * for an ICMP redirect when the media is shared or the source is
	 * on-link with the chosen gateway.
	 */
	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	/* Fill both the cache-lookup key fields (rt_key_*) and the
	 * route properties of the new entry.
	 */
	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	/* Forwarding path: deliver to ip_forward on input, ip_output on
	 * the way out.
	 */
	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212
Stephen Hemminger5969f712008-04-10 01:52:09 -07002213static int ip_mkroute_input(struct sk_buff *skb,
2214 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002215 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002216 struct in_device *in_dev,
2217 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218{
Chuck Short7abaa272005-06-22 22:10:23 -07002219 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002220 int err;
2221 unsigned hash;
2222
2223#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002224 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002225 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226#endif
2227
2228 /* create a routing cache entry */
2229 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2230 if (err)
2231 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002232
2233 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002234 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002235 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002236 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002237 if (IS_ERR(rth))
2238 return PTR_ERR(rth);
2239 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002240}
2241
/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must already have the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	Called with rcu_read_lock().
 */
2252
Al Viro9e12bb22006-09-26 21:25:20 -07002253static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002254 u8 tos, struct net_device *dev)
2255{
2256 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002257 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002258 struct flowi4 fl4;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002259 unsigned flags = 0;
2260 u32 itag = 0;
2261 struct rtable * rth;
2262 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002263 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264 int err = -EINVAL;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002265 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002266
2267 /* IP on this device is disabled. */
2268
2269 if (!in_dev)
2270 goto out;
2271
2272 /* Check for the most weird martians, which can be not detected
2273 by fib_lookup.
2274 */
2275
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002276 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002277 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002278 goto martian_source;
2279
Andy Walls27a954b2010-10-17 15:11:22 +00002280 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281 goto brd_input;
2282
2283 /* Accept zero addresses only to limited broadcast;
2284 * I even do not know to fix it or not. Waiting for complains :-)
2285 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002286 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002287 goto martian_source;
2288
Andy Walls27a954b2010-10-17 15:11:22 +00002289 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002290 goto martian_destination;
2291
2292 /*
2293 * Now we are ready to route packet.
2294 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002295 fl4.flowi4_oif = 0;
2296 fl4.flowi4_iif = dev->ifindex;
2297 fl4.flowi4_mark = skb->mark;
2298 fl4.flowi4_tos = tos;
2299 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2300 fl4.daddr = daddr;
2301 fl4.saddr = saddr;
2302 err = fib_lookup(net, &fl4, &res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002303 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002304 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002305 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002306 goto no_route;
2307 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002308
2309 RT_CACHE_STAT_INC(in_slow_tot);
2310
2311 if (res.type == RTN_BROADCAST)
2312 goto brd_input;
2313
2314 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002315 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002316 net->loopback_dev->ifindex,
Michael Smith5c04c812011-04-07 04:51:50 +00002317 dev, &spec_dst, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002318 if (err < 0)
2319 goto martian_source_keep_err;
2320 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321 flags |= RTCF_DIRECTSRC;
2322 spec_dst = daddr;
2323 goto local_input;
2324 }
2325
2326 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002327 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002328 if (res.type != RTN_UNICAST)
2329 goto martian_destination;
2330
David S. Miller68a5e3d2011-03-11 20:07:33 -05002331 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002332out: return err;
2333
2334brd_input:
2335 if (skb->protocol != htons(ETH_P_IP))
2336 goto e_inval;
2337
Joe Perchesf97c1e02007-12-16 13:45:43 -08002338 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002339 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2340 else {
Michael Smith5c04c812011-04-07 04:51:50 +00002341 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2342 &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002344 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002345 if (err)
2346 flags |= RTCF_DIRECTSRC;
2347 }
2348 flags |= RTCF_BROADCAST;
2349 res.type = RTN_BROADCAST;
2350 RT_CACHE_STAT_INC(in_brd);
2351
2352local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002353 rth = rt_dst_alloc(net->loopback_dev,
2354 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002355 if (!rth)
2356 goto e_nobufs;
2357
David S. Millercf911662011-04-28 14:31:47 -07002358 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002359 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002360#ifdef CONFIG_IP_ROUTE_CLASSID
2361 rth->dst.tclassid = itag;
2362#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002363
David S. Miller5e2b61f2011-03-04 21:47:09 -08002364 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002365 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002366 rth->rt_genid = rt_genid(net);
2367 rth->rt_flags = flags|RTCF_LOCAL;
2368 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002369 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002370 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 rth->rt_src = saddr;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002372#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002373 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374#endif
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002375 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002376 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002377 rth->rt_oif = 0;
2378 rth->rt_mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002379 rth->rt_gateway = daddr;
2380 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002381 rth->rt_peer_genid = 0;
2382 rth->peer = NULL;
2383 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002385 rth->dst.input= ip_error;
2386 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002387 rth->rt_flags &= ~RTCF_LOCAL;
2388 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002389 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2390 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002391 err = 0;
2392 if (IS_ERR(rth))
2393 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002394 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395
2396no_route:
2397 RT_CACHE_STAT_INC(in_no_route);
2398 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2399 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002400 if (err == -ESRCH)
2401 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002402 goto local_input;
2403
2404 /*
2405 * Do not cache martian addresses: they should be logged (RFC1812)
2406 */
2407martian_destination:
2408 RT_CACHE_STAT_INC(in_martian_dst);
2409#ifdef CONFIG_IP_ROUTE_VERBOSE
2410 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Joe Perches058bd4d2012-03-11 18:36:11 +00002411 pr_warn("martian destination %pI4 from %pI4, dev %s\n",
Harvey Harrison673d57e2008-10-31 00:53:57 -07002412 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002413#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002414
2415e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002416 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002417 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002418
Linus Torvalds1da177e2005-04-16 15:20:36 -07002419e_inval:
2420 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002421 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002422
2423e_nobufs:
2424 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002425 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426
2427martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002428 err = -EINVAL;
2429martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002430 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002431 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002432}
2433
Eric Dumazet407eadd2010-05-10 11:32:55 +00002434int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2435 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436{
2437 struct rtable * rth;
2438 unsigned hash;
2439 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002440 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002441 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002442
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002443 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002444
Eric Dumazet96d36222010-06-02 19:21:31 +00002445 rcu_read_lock();
2446
Neil Horman1080d702008-10-27 12:28:25 -07002447 if (!rt_caching(net))
2448 goto skip_cache;
2449
Linus Torvalds1da177e2005-04-16 15:20:36 -07002450 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002451 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002452
Linus Torvalds1da177e2005-04-16 15:20:36 -07002453 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002454 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002455 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2456 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
Julian Anastasov97a80412011-08-09 04:01:16 +00002457 (rth->rt_route_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002458 (rth->rt_key_tos ^ tos)) == 0 &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002459 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002460 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002461 !rt_is_expired(rth)) {
David S. Millerde398fb2011-12-05 13:21:42 -05002462 ipv4_validate_peer(rth);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002463 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002464 dst_use_noref(&rth->dst, jiffies);
2465 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002466 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002467 dst_use(&rth->dst, jiffies);
2468 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002469 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470 RT_CACHE_STAT_INC(in_hit);
2471 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002472 return 0;
2473 }
2474 RT_CACHE_STAT_INC(in_hlist_search);
2475 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002476
Neil Horman1080d702008-10-27 12:28:25 -07002477skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002478 /* Multicast recognition logic is moved from route cache to here.
2479 The problem was that too many Ethernet cards have broken/missing
2480 hardware multicast filters :-( As result the host on multicasting
2481 network acquires a lot of useless route cache entries, sort of
2482 SDR messages from all the world. Now we try to get rid of them.
2483 Really, provided software IP multicast filter is organized
2484 reasonably (at least, hashed), it does not result in a slowdown
2485 comparing with route cache reject entries.
2486 Note, that multicast routers are not affected, because
2487 route cache entry is created eventually.
2488 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002489 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002490 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491
Eric Dumazet96d36222010-06-02 19:21:31 +00002492 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002493 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2494 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002495 if (our
2496#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002497 ||
2498 (!ipv4_is_local_multicast(daddr) &&
2499 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002500#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002501 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002502 int res = ip_route_input_mc(skb, daddr, saddr,
2503 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002505 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506 }
2507 }
2508 rcu_read_unlock();
2509 return -EINVAL;
2510 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002511 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2512 rcu_read_unlock();
2513 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002514}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002515EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002517/* called with rcu_read_lock() */
/*
 * Build a route-cache entry for an output route resolved by the
 * caller (ip_route_output_slow).  Reclassifies the destination into
 * broadcast/multicast where needed, sets the RTCF_* flags, allocates
 * and fills the rtable, and wires up the input/output handlers.
 * Returns the new rtable or an ERR_PTR() on failure.  The orig_*
 * parameters preserve the caller's original flow key for cache
 * lookups, distinct from the possibly-rewritten fl4 values.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	/* A loopback source may only leave via a loopback device. */
	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	/* Override the FIB's type from the destination address class. */
	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		/* Drop local delivery if we are not a member of the group. */
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	/* Cache key keeps the caller's original flow values; the route
	 * fields below use the (possibly rewritten) fl4 addresses.
	 */
	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
2617
Linus Torvalds1da177e2005-04-16 15:20:36 -07002618/*
2619 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002620 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621 */
2622
David S. Miller813b3b52011-04-28 14:48:42 -07002623static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002624{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002625 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002626 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002627 unsigned int flags = 0;
2628 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002629 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002630 __be32 orig_daddr;
2631 __be32 orig_saddr;
2632 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633
2634 res.fi = NULL;
2635#ifdef CONFIG_IP_MULTIPLE_TABLES
2636 res.r = NULL;
2637#endif
2638
David S. Miller813b3b52011-04-28 14:48:42 -07002639 orig_daddr = fl4->daddr;
2640 orig_saddr = fl4->saddr;
2641 orig_oif = fl4->flowi4_oif;
2642
2643 fl4->flowi4_iif = net->loopback_dev->ifindex;
2644 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2645 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2646 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002647
David S. Miller010c2702011-02-17 15:37:09 -08002648 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002649 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002650 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002651 if (ipv4_is_multicast(fl4->saddr) ||
2652 ipv4_is_lbcast(fl4->saddr) ||
2653 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002654 goto out;
2655
Linus Torvalds1da177e2005-04-16 15:20:36 -07002656 /* I removed check for oif == dev_out->oif here.
2657 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002658 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2659 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660 2. Moreover, we are allowed to send packets with saddr
2661 of another iface. --ANK
2662 */
2663
David S. Miller813b3b52011-04-28 14:48:42 -07002664 if (fl4->flowi4_oif == 0 &&
2665 (ipv4_is_multicast(fl4->daddr) ||
2666 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002667 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002668 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002669 if (dev_out == NULL)
2670 goto out;
2671
Linus Torvalds1da177e2005-04-16 15:20:36 -07002672 /* Special hack: user can direct multicasts
2673 and limited broadcast via necessary interface
2674 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2675 This hack is not just for fun, it allows
2676 vic,vat and friends to work.
2677 They bind socket to loopback, set ttl to zero
2678 and expect that it will work.
2679 From the viewpoint of routing cache they are broken,
2680 because we are not allowed to build multicast path
2681 with loopback source addr (look, routing cache
2682 cannot know, that ttl is zero, so that packet
2683 will not leave this host and route is valid).
2684 Luckily, this hack is good workaround.
2685 */
2686
David S. Miller813b3b52011-04-28 14:48:42 -07002687 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002688 goto make_route;
2689 }
Julian Anastasova210d012008-10-01 07:28:28 -07002690
David S. Miller813b3b52011-04-28 14:48:42 -07002691 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002692 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002693 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002694 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002695 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002696 }
2697
2698
David S. Miller813b3b52011-04-28 14:48:42 -07002699 if (fl4->flowi4_oif) {
2700 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002701 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002702 if (dev_out == NULL)
2703 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002704
2705 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002706 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002707 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002708 goto out;
2709 }
David S. Miller813b3b52011-04-28 14:48:42 -07002710 if (ipv4_is_local_multicast(fl4->daddr) ||
2711 ipv4_is_lbcast(fl4->daddr)) {
2712 if (!fl4->saddr)
2713 fl4->saddr = inet_select_addr(dev_out, 0,
2714 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002715 goto make_route;
2716 }
David S. Miller813b3b52011-04-28 14:48:42 -07002717 if (fl4->saddr) {
2718 if (ipv4_is_multicast(fl4->daddr))
2719 fl4->saddr = inet_select_addr(dev_out, 0,
2720 fl4->flowi4_scope);
2721 else if (!fl4->daddr)
2722 fl4->saddr = inet_select_addr(dev_out, 0,
2723 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002724 }
2725 }
2726
David S. Miller813b3b52011-04-28 14:48:42 -07002727 if (!fl4->daddr) {
2728 fl4->daddr = fl4->saddr;
2729 if (!fl4->daddr)
2730 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002731 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002732 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002733 res.type = RTN_LOCAL;
2734 flags |= RTCF_LOCAL;
2735 goto make_route;
2736 }
2737
David S. Miller813b3b52011-04-28 14:48:42 -07002738 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002739 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002740 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002741 /* Apparently, routing tables are wrong. Assume,
2742 that the destination is on link.
2743
2744 WHY? DW.
2745 Because we are allowed to send to iface
2746 even if it has NO routes and NO assigned
2747 addresses. When oif is specified, routing
2748 tables are looked up with only one purpose:
2749 to catch if destination is gatewayed, rather than
2750 direct. Moreover, if MSG_DONTROUTE is set,
2751 we send packet, ignoring both routing tables
2752 and ifaddr state. --ANK
2753
2754
2755 We could make it even if oif is unknown,
2756 likely IPv6, but we do not.
2757 */
2758
David S. Miller813b3b52011-04-28 14:48:42 -07002759 if (fl4->saddr == 0)
2760 fl4->saddr = inet_select_addr(dev_out, 0,
2761 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762 res.type = RTN_UNICAST;
2763 goto make_route;
2764 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002765 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002766 goto out;
2767 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002768
2769 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002770 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002771 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002772 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002773 else
David S. Miller813b3b52011-04-28 14:48:42 -07002774 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002775 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002776 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002777 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778 res.fi = NULL;
2779 flags |= RTCF_LOCAL;
2780 goto make_route;
2781 }
2782
2783#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002784 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002785 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786 else
2787#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002788 if (!res.prefixlen &&
2789 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002790 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002791 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002792
David S. Miller813b3b52011-04-28 14:48:42 -07002793 if (!fl4->saddr)
2794 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002795
Linus Torvalds1da177e2005-04-16 15:20:36 -07002796 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002797 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798
2799
2800make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002801 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002802 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002803 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002804 unsigned int hash;
2805
David S. Miller813b3b52011-04-28 14:48:42 -07002806 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002807 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002808 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002809 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810
David S. Miller010c2702011-02-17 15:37:09 -08002811out:
2812 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002813 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814}
2815
/*
 * Resolve an output route for the flow key in @flp4.
 *
 * Fast path: probe the routing cache hash chain for a matching cached
 * output route.  On a hit the cache entry is returned with its use count
 * bumped, and any wildcard (zero) saddr/daddr in the flow key is filled
 * in from the cached route.  On a miss (or when caching is disabled) the
 * slow path builds the route from the FIB.
 *
 * Returns a struct rtable * or an ERR_PTR() (propagated from the slow path).
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	/* Cache may be globally/namespace disabled; go straight to FIB. */
	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	/* _bh flavor: the cache is also written from softirq context. */
	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		/* All lookup key fields must match; TOS is compared only in
		 * the bits that matter for routing (IPTOS_RT_MASK) plus the
		 * RTO_ONLINK flag.  Stale (expired-generation) entries and
		 * entries from other namespaces are skipped.
		 */
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			/* Take a reference and refresh lastuse before
			 * dropping the RCU read lock. */
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			/* Resolve wildcard addresses in the caller's key
			 * from the cached route. */
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002855EXPORT_SYMBOL_GPL(__ip_route_output_key);
2856
/*
 * dst_ops->check for blackhole routes: a blackhole entry is never
 * revalidated for reuse, so always report it as stale (NULL).
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2861
Steffen Klassertebb762f2011-11-23 02:12:51 +00002862static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002863{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002864 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2865
2866 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002867}
2868
/*
 * dst_ops->update_pmtu for blackhole routes: intentionally a no-op;
 * a blackhole entry never forwards traffic, so PMTU updates are ignored.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2872
/*
 * dst_ops->cow_metrics for blackhole routes: refuse to copy-on-write the
 * metrics block — the caller gets NULL and keeps the shared metrics.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2878
/*
 * dst_ops vtable for IPv4 blackhole routes (used by ipv4_blackhole_route).
 * Revalidation, PMTU updates and metric COW are all stubbed out since a
 * blackhole entry discards traffic; destroy/advmss/neigh_lookup reuse the
 * regular IPv4 implementations.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2890
/*
 * Build a "blackhole" clone of @dst_orig: an rtable that carries the same
 * key/metric/peer state but whose input/output handlers discard all
 * packets.  Consumes a reference on @dst_orig in all cases.
 *
 * Returns &rt->dst on success, ERR_PTR(-ENOMEM) if allocation failed.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Both directions drop traffic: that is what makes it a
		 * blackhole. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Copy the full lookup key ... */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		/* ... and the resolved route state. */
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Shared peer/fib-info need their refcounts bumped. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2937
David S. Miller9d6ec932011-03-12 01:12:47 -05002938struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002939 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002940{
David S. Miller9d6ec932011-03-12 01:12:47 -05002941 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002942
David S. Millerb23dd4f2011-03-02 14:31:35 -08002943 if (IS_ERR(rt))
2944 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002945
David S. Miller56157872011-05-02 14:37:45 -07002946 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002947 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2948 flowi4_to_flowi(flp4),
2949 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002950
David S. Millerb23dd4f2011-03-02 14:31:35 -08002951 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002952}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002953EXPORT_SYMBOL_GPL(ip_route_output_flow);
2954
/*
 * Fill @skb with an RTM_* netlink message describing the route attached
 * to it (for RTM_GETROUTE replies and route-cache dumps).
 *
 * @pid/@seq/@event/@flags parametrize the netlink header; @nowait is
 * forwarded to ipmr_get_route() for the multicast case.
 *
 * Returns the result of nlmsg_end() on success, 0 when ipmr queued an
 * asynchronous resolution, or -EMSGSIZE when the skb ran out of room.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	/* Fixed rtmsg header: cached routes are always host routes
	 * (/32) from the main table, flagged RTM_F_CLONED. */
	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Preferred source: spec_dst for input routes, otherwise the
	 * resolved source when it differs from the lookup key. */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		/* IP ID, TCP timestamp state and remaining PMTU lifetime
		 * come from the shared inet_peer entry. */
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			/* Convert absolute jiffies deadline to a relative
			 * remaining time; already-elapsed becomes 0. */
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					/* err == 0: ipmr queued the request
					 * and will answer asynchronously. */
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3062
/*
 * RTM_GETROUTE handler: answer a userspace "what route would this packet
 * take?" query.  With RTA_IIF set the kernel simulates *input* routing on
 * that device using a dummy skb; otherwise it performs a normal output
 * route lookup.  The resolved route is serialized via rt_fill_info() and
 * unicast back to the requester.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* All attributes are optional; missing ones default to 0. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		/* Input-route simulation on the named device. */
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		/* ip_route_input() expects softirq context. */
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* Lookup may "succeed" with an error route attached. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		/* Plain output route lookup. */
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	/* rtnl_unicast() consumes skb, so skip the free path. */
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3157
/*
 * Netlink dump callback: walk the whole routing cache and emit one
 * RTM_NEWROUTE message per live entry belonging to the caller's netns.
 *
 * Resumption state lives in cb->args[0] (hash bucket) and cb->args[1]
 * (index within the bucket) so a dump interrupted by a full skb can be
 * continued on the next invocation.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			/* Skip other namespaces, already-dumped entries,
			 * and stale-generation routes. */
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* Borrow (no refcount) the dst just long enough
			 * for rt_fill_info(); dropped right after. */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				/* Out of room: record position and stop. */
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3199
3200void ip_rt_multicast_event(struct in_device *in_dev)
3201{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07003202 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003203}
3204
3205#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003206static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003207 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003208 size_t *lenp, loff_t *ppos)
3209{
3210 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003211 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003212 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003213 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003214
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003215 memcpy(&ctl, __ctl, sizeof(ctl));
3216 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003217 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003218
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003219 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003220 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003221 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003222 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003223
3224 return -EINVAL;
3225}
3226
/*
 * Global (non per-netns) sysctl knobs under net.ipv4.route.
 * Garbage-collection tuning, ICMP redirect rate limiting, ICMP error
 * rate limiting, and PMTU bounds.  Time-valued entries use the jiffies
 * (or ms-jiffies) proc handlers; plain integers use proc_dointvec.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same variable as above, but in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003337
/* Placeholder table with nothing but the sentinel entry. */
static struct ctl_table empty[1];

/*
 * Skeleton registered early (see ip_static_sysctl_init()) so that the
 * /proc/sys/net/ipv4/route and /proc/sys/net/ipv4/neigh directories
 * exist before later per-net registrations hang entries under them.
 * NOTE(review): "neigh" deliberately points at an empty table —
 * presumably the real entries are registered elsewhere; confirm.
 */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
3348
/* sysctl path "/proc/sys/net/ipv4" for registering ipv4_skeleton. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003354
/*
 * Write-only "flush" control. There is no .data: the handler uses
 * .extra1, which sysctl_route_net_init() patches to point at the
 * owning struct net before registration.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,	/* root write-only */
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3364
/* sysctl path "/proc/sys/net/ipv4/route" for the per-net flush table. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3371
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003372static __net_init int sysctl_route_net_init(struct net *net)
3373{
3374 struct ctl_table *tbl;
3375
3376 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003377 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003378 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3379 if (tbl == NULL)
3380 goto err_dup;
3381 }
3382 tbl[0].extra1 = net;
3383
3384 net->ipv4.route_hdr =
3385 register_net_sysctl_table(net, ipv4_route_path, tbl);
3386 if (net->ipv4.route_hdr == NULL)
3387 goto err_reg;
3388 return 0;
3389
3390err_reg:
3391 if (tbl != ipv4_route_flush_table)
3392 kfree(tbl);
3393err_dup:
3394 return -ENOMEM;
3395}
3396
3397static __net_exit void sysctl_route_net_exit(struct net *net)
3398{
3399 struct ctl_table *tbl;
3400
3401 tbl = net->ipv4.route_hdr->ctl_table_arg;
3402 unregister_net_sysctl_table(net->ipv4.route_hdr);
3403 BUG_ON(tbl == ipv4_route_flush_table);
3404 kfree(tbl);
3405}
3406
/* Hooks the per-net route sysctl setup/teardown into namespace life cycle. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003411#endif
3412
Neil Horman3ee94372010-05-08 01:57:52 -07003413static __net_init int rt_genid_init(struct net *net)
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003414{
Neil Horman3ee94372010-05-08 01:57:52 -07003415 get_random_bytes(&net->ipv4.rt_genid,
3416 sizeof(net->ipv4.rt_genid));
David S. Miller436c3b62011-03-24 17:42:21 -07003417 get_random_bytes(&net->ipv4.dev_addr_genid,
3418 sizeof(net->ipv4.dev_addr_genid));
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003419 return 0;
3420}
3421
/* Randomize route generation ids whenever a namespace is created. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3425
3426
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Per-CPU route-classid accounting buckets; allocated in ip_rt_init()
 * (256 entries per CPU) and read-mostly afterwards.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003430
3431static __initdata unsigned long rhash_entries;
3432static int __init set_rhash_entries(char *str)
3433{
3434 if (!str)
3435 return 0;
3436 rhash_entries = simple_strtoul(str, &str, 0);
3437 return 1;
3438}
3439__setup("rhash_entries=", set_rhash_entries);
3440
/*
 * Boot-time initialization of the IPv4 routing subsystem: dst slab
 * cache, route-cache hash table, /proc entries, netlink handler and
 * the per-net sysctl/genid hooks. Panics on allocation failure for
 * the pieces the stack cannot run without. Always returns 0.
 * NOTE(review): the statement order below is load-bearing — the slab
 * cache must exist before the dst counters, and the hash sizing feeds
 * gc_thresh / ip_rt_max_size.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* 256 accounting slots per CPU; classid accounting is mandatory
	 * once the config option is on, hence the panic. */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Blackhole dsts share the same slab cache. */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/* Size the route cache hash from available memory (2^15 or 2^17
	 * buckets max depending on RAM) unless rhash_entries= overrode it;
	 * the resulting log2/mask come back via rt_hash_log/rt_hash_mask. */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* GC thresholds are derived from the actual table size. */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* Periodic route-cache expiry worker; first run is jittered by
	 * net_random() so all machines don't fire in lockstep. */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* Non-fatal: routing works without the /proc files. */
	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3501
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/*
 * Register the static /proc/sys/net/ipv4 skeleton (route + neigh
 * directories) early in boot, before the per-net sysctl machinery
 * fills them in.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif