blob: 0197747961748aff73465a90bd689d70b3113880 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090093#include <linux/slab.h>
Stephen Rothwellb9eda062011-12-22 17:03:29 +110094#include <linux/prefetch.h>
Herbert Xu352e5122007-11-13 21:34:06 -080095#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020096#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070097#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700107#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700108#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
David S. Miller6e5714e2011-08-03 20:50:44 -0700112#include <net/secure_seq.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700113
/* Mask a flow's TOS down to the bits that influence routing
 * (IPTOS_RT_MASK) plus the RTO_ONLINK pseudo-flag. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

/* Largest MTU ever installed on an IPv4 route (just below 64KB). */
#define IP_MAX_MTU	0xFFF0

/* Default lifetime of an unused route cache entry. */
#define RT_GC_TIMEOUT (300*HZ)

/* Route cache tunables.  __read_mostly keeps them out of hot, written
 * cache lines; presumably exposed via sysctl elsewhere in this file —
 * confirm against the sysctl table. */
static int ip_rt_max_size;				/* cache capacity limit */
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20; /* payload + IP + TCP headers */
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;	/* per-bucket chain length cap */

static struct delayed_work expires_work;	/* periodic cache-expiry worker */
static unsigned long expires_ljiffies;
138
Linus Torvalds1da177e2005-04-16 15:20:36 -0700139/*
140 * Interface to generic destination cache.
141 */
142
143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
David S. Miller0dbaee32010-12-13 12:52:14 -0800144static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
Steffen Klassertebb762f2011-11-23 02:12:51 +0000145static unsigned int ipv4_mtu(const struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700146static void ipv4_dst_destroy(struct dst_entry *dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148static void ipv4_link_failure(struct sk_buff *skb);
149static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800150static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151
/* dst_ops ->ifdown hook: intentionally a no-op for IPv4 — the generic
 * dst layer already handles device-down reparenting, so there is no
 * per-route state to tear down here. */
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700156
/* dst_ops ->cow_metrics hook: copy-on-write the route's metrics array.
 *
 * The writable copy lives in the route's inet_peer.  Returns a pointer
 * to the writable metrics, or NULL if no peer could be bound or another
 * CPU won the race and installed a read-only block.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	/* Bind an inet_peer for the destination if we don't have one yet. */
	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		/* Freshly created peer metrics: seed them from the old
		 * (read-only) metrics block. */
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		/* Atomically swing dst->_metrics from the old block to
		 * the peer's writable block. */
		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			/* Lost the race: someone else already installed a
			 * metrics block.  Use theirs if it is writable. */
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			/* We won: the fib_info that backed the old metrics
			 * is no longer referenced through this route. */
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
191
David S. Millerd3aaeb32011-07-18 00:40:17 -0700192static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
193
/* Operations vector plugging IPv4 routes into the generic destination
 * cache (struct dst_entry) machinery. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
210
211#define ECN_OR_COST(class) TC_PRIO_##class
212
/* Map the upper nibble of the IP TOS byte to a packet-scheduler
 * priority (TC_PRIO_*).  Indexed by TOS >> 1 & 0xF; odd TOS values
 * (ECN bit set) share the priority of their base class via
 * ECN_OR_COST(). */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
231
232
233/*
234 * Route cache.
235 */
236
237/* The locking scheme is rather straight forward:
238 *
239 * 1) Read-Copy Update protects the buckets of the central route hash.
240 * 2) Only writers remove entries, and they hold the lock
241 * as they look at rtable reference counts.
242 * 3) Only readers acquire references to rtable entries,
243 * they do so with atomic increments and with the
244 * lock held.
245 */
246
/* One bucket of the central route-cache hash; the chain of rtables is
 * RCU-protected (readers traverse lock-free, writers take a bucket
 * spinlock from the table below). */
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
/* Map a hash bucket index to its (shared) spinlock. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/* Allocate and initialize the bucket spinlock table at boot; the route
 * cache is essential, so allocation failure is fatal. */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP without lock debugging: no bucket locks are needed at all. */
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296
/* The central route-cache hash table and its geometry; sized at boot.
 * rt_hash_mask = (table size - 1), rt_hash_log = log2(table size). */
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

/* Per-CPU route cache statistics, exported via /proc (see rt_cpu_seq_show). */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303
/* Hash a (daddr, saddr, ifindex, generation) tuple into a route-cache
 * bucket index.  Folding the generation id in means bumping it
 * effectively invalidates the whole cache without walking it. */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}
311
/* Current route-cache generation for this network namespace; entries
 * whose rt_genid differs are stale. */
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
316
Linus Torvalds1da177e2005-04-16 15:20:36 -0700317#ifdef CONFIG_PROC_FS
/* Iterator state for the /proc/net/rt_cache seq_file walk: current
 * hash bucket, plus the generation id snapshotted at ->start so the
 * walk skips entries created mid-iteration. */
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;		/* hash bucket currently being walked */
	int genid;		/* generation snapshot for this walk */
};
323
/* Find the first live cache entry, scanning buckets from the top down.
 * On success we return with rcu_read_lock_bh() held (released by
 * rt_cache_seq_stop or when advancing past the last entry); empty
 * buckets are skipped without taking the lock. */
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		/* Cheap lockless peek before committing to the bucket. */
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			/* Only report entries of this netns and the
			 * generation snapshotted at ->start. */
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;	/* rcu_read_lock_bh still held */
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}
344
/* Advance to the next entry after @r, crossing into lower-numbered
 * buckets as chains are exhausted.  Caller holds rcu_read_lock_bh();
 * the lock is dropped while probing for the next non-empty bucket and
 * re-taken before dereferencing its chain.  Returns NULL (lock
 * released) when the table is exhausted. */
static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;	/* done; lock already dropped */
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}
362
/* Like __rt_cache_get_next() but filtered: skip entries belonging to
 * other network namespaces or to a different cache generation. */
static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}
375
/* Return the entry at zero-based position @pos in the filtered walk,
 * or NULL if the cache holds fewer entries than that. */
static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
385
/* seq_file ->start: position 0 yields the header token and snapshots
 * the current cache generation; later positions re-seek from the top
 * (seq_file may restart the walk between reads). */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}
394
/* seq_file ->next: step from the header token to the first entry, or
 * from one entry to the next. */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}
406
407static void rt_cache_seq_stop(struct seq_file *seq, void *v)
408{
409 if (v && v != SEQ_START_TOKEN)
410 rcu_read_unlock_bh();
411}
412
/* seq_file ->show: emit the /proc/net/rt_cache header line, or one
 * fixed-width (127-char, space padded) line per cache entry. */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		/* Peek at the neighbour under RCU only to report whether
		 * its hardware address is up to date. */
		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		/* %n records how many characters were printed so the line
		 * can be padded to the fixed 127-column width below. */
		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,	/* HHRef: no longer tracked, kept for format compat */
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}
450
/* seq_file glue for /proc/net/rt_cache (per-netns route cache dump). */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

/* ->open: attach the netns-aware iterator state to the seq_file. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
471
472
/* seq_file ->start for /proc/net/stat/rt_cache: position 0 is the
 * header token; positions >= 1 map to the (pos-1)'th possible CPU's
 * statistics block.  *pos is advanced past impossible CPU ids so
 * ->next resumes correctly. */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
488
/* seq_file ->next: advance to the next possible CPU's statistics,
 * mirroring the scan in rt_cpu_seq_start(). */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}
502
/* seq_file ->stop: nothing to release — the per-CPU stats walk takes
 * no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
507
/* seq_file ->show: print the column header, or one CPU's route-cache
 * statistics.  The first column is the global dst entry count, repeated
 * on every row for historical format compatibility. */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}
541
/* seq_file glue for /proc/net/stat/rt_cache (per-CPU cache counters). */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

/* ->open: plain seq_open — the per-CPU walk keeps no private state. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
562
Patrick McHardyc7066f72011-01-14 13:36:42 +0100563#ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct: sum the per-CPU ip_rt_acct counters (256 realms,
 * four u32-style counters each) into a temporary array and emit it as
 * one binary blob. */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	/* Accumulate every possible CPU's counters per realm. */
	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
Alexey Dobriyana661c412009-11-25 15:40:35 -0800587
/* single_open glue for /proc/net/rt_acct (whole blob in one ->show). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800600#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800601
/* Per-netns init: register the routing /proc entries — rt_cache (the
 * cache dump), stat/rt_cache (per-CPU counters) and, with
 * CONFIG_IP_ROUTE_CLASSID, rt_acct (realm accounting).  Unwinds any
 * entries already registered on failure and returns -ENOMEM. */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

	/* goto-based unwind: remove in reverse registration order. */
#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800632
/* Per-netns exit: remove every /proc entry ip_rt_do_proc_init created. */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
641
/* Hook the /proc setup/teardown into network-namespace lifetime. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
651
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800652#else
/* CONFIG_PROC_FS disabled: nothing to register, always succeed. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700657#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900658
/* Schedule RCU-deferred destruction of a route cache entry; readers
 * traversing the chain under rcu_read_lock_bh() may still see it until
 * a grace period elapses. */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

/* As rt_free(), but also drop the reference the caller holds. */
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
669
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggresively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}
677
/* An entry is worth keeping longer if it was learned via redirect,
 * is watched by userspace (RTCF_NOTIFY), or carries live PMTU state. */
static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
683
/* Decide whether a cache entry may be garbage-collected.  Never expire
 * an entry that is still referenced.  Otherwise keep it while younger
 * than @tmo1 (unless it is a fast-clean candidate) or, if valuable,
 * while younger than @tmo2.  Returns 1 when the entry may go. */
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
699
700/* Bits of score are:
701 * 31: very valuable
702 * 30: not quite useless
703 * 29..0: usage counter
704 */
705static inline u32 rt_score(struct rtable *rt)
706{
Changli Gaod8d1f302010-06-10 23:31:35 -0700707 u32 score = jiffies - rt->dst.lastuse;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700708
709 score = ~score & ~(3<<30);
710
711 if (rt_valuable(rt))
712 score |= (1<<31);
713
David S. Millerc7537962010-11-11 17:07:48 -0800714 if (rt_is_output_route(rt) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700715 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
716 score |= (1<<30);
717
718 return score;
719}
720
Neil Horman1080d702008-10-27 12:28:25 -0700721static inline bool rt_caching(const struct net *net)
722{
723 return net->ipv4.current_rt_cache_rebuild_count <=
724 net->ipv4.sysctl_rt_cache_rebuild_count;
725}
726
David S. Miller5e2b61f2011-03-04 21:47:09 -0800727static inline bool compare_hash_inputs(const struct rtable *rt1,
728 const struct rtable *rt2)
Neil Horman1080d702008-10-27 12:28:25 -0700729{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800730 return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
731 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000732 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
Neil Horman1080d702008-10-27 12:28:25 -0700733}
734
David S. Miller5e2b61f2011-03-04 21:47:09 -0800735static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700736{
David S. Miller5e2b61f2011-03-04 21:47:09 -0800737 return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
738 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
739 (rt1->rt_mark ^ rt2->rt_mark) |
David S. Miller475949d2011-05-03 19:45:15 -0700740 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
Julian Anastasovd547f722011-08-07 22:20:20 -0700741 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
Julian Anastasov97a80412011-08-09 04:01:16 +0000742 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700743}
744
/* True when both cache entries belong to the same network namespace,
 * judged by the netns owning each entry's output device.
 */
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}
749
/* An entry is expired when its generation id no longer matches the
 * current per-netns cache generation (bumped by rt_cache_invalidate()).
 */
static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
754
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 *
 * @net: only entries belonging to this netns are flushed; NULL flushes
 *       every namespace.
 * Strategy per bucket: under the bucket lock, unlink matching entries
 * onto a private list, then free that list after dropping the lock so
 * rt_free()'s RCU work is not done while holding it.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		/* Lock-free peek: skip empty buckets without taking the lock. */
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				/* Unlink from the chain, push onto private list. */
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		/* Free collected entries outside the bucket lock. */
		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
805
Neil Horman1080d702008-10-27 12:28:25 -0700806/*
807 * While freeing expired entries, we compute average chain length
808 * and standard deviation, using fixed-point arithmetic.
809 * This to have an estimation of rt_chain_length_max
810 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
812 */
813
814#define FRACT_BITS 3
815#define ONE (1UL << FRACT_BITS)
816
Eric Dumazet98376382010-03-08 03:20:00 +0000817/*
818 * Given a hash chain and an item in this hash chain,
819 * find if a previous entry has the same hash_inputs
820 * (but differs on tos, mark or oif)
821 * Returns 0 if an alias is found.
822 * Returns ONE if rth has no alias before itself.
823 */
824static int has_noalias(const struct rtable *head, const struct rtable *rth)
825{
826 const struct rtable *aux = head;
827
828 while (aux != rth) {
David S. Miller5e2b61f2011-03-04 21:47:09 -0800829 if (compare_hash_inputs(aux, rth))
Eric Dumazet98376382010-03-08 03:20:00 +0000830 return 0;
Eric Dumazet1c317202010-10-25 21:02:07 +0000831 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +0000832 }
833 return ONE;
834}
835
/* Periodic garbage collection of the routing cache, run from the
 * delayed work rt_worker_func().  Scans a number of buckets
 * proportional to the time elapsed since the last scan (resuming at
 * the static @rover), frees expired/aged entries, and recomputes
 * rt_chain_length_max from the observed chain-length average and
 * standard deviation (fixed-point, see FRACT_BITS above).
 */
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	/* goal = buckets to visit = elapsed_time * table_size / gc_timeout,
	 * capped at one full pass over the table. */
	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		/* Skip empty buckets without taking the lock. */
		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					/* Kept: halve the timeout so deeper chain
					 * positions are judged more harshly. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		/* Chain-length statistics in FRACT_BITS fixed point:
		 * limit = max(elasticity, avg + 4*stddev). */
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
916
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	/* Re-arm: the scan repeats every ip_rt_gc_interval jiffies. */
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
926
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	/* Random increment keeps recently used genids from recurring;
	 * the +1 guarantees the genid always actually changes. */
	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	/* Peer-held learned state (PMTU, redirects) is stale as well. */
	inetpeer_invalidate_tree(AF_INET);
}
941
/*
 * delay < 0 : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	/* Bumping the generation id invalidates every entry at once. */
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	/* Synchronous sweep; may reschedule when in process context. */
	rt_do_flush(net, !in_softirq());
}
952
/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	/* Process context allows rt_do_flush() to reschedule mid-sweep. */
	rt_do_flush(net, !in_softirq());
}
958
/* Invoked when a hash chain grows past rt_chain_length_max: warn (rate
 * limited) and invalidate the entire cache for this netns.  The caller
 * accounts the event in current_rt_cache_rebuild_count.
 */
static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
965
/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

/* dst_ops.gc callback for ipv4_dst_ops.  Returns 0 when enough entries
 * were reclaimed (or GC was skipped), 1 on unrecoverable cache overflow.
 * All the static locals persist tuning state across invocations.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		/* One pass over all buckets, resuming where the previous
		 * invocation stopped (@rover). */
		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					/* Kept: halve the timeout for entries
					 * deeper in the chain. */
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	/* Fast counter first, slow (exact) counter only as confirmation. */
	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Goal met: relax expiration strength for the next run. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
1098
Eric Dumazet98376382010-03-08 03:20:00 +00001099/*
1100 * Returns number of entries in a hash chain that have different hash_inputs
1101 */
1102static int slow_chain_length(const struct rtable *head)
1103{
1104 int length = 0;
1105 const struct rtable *rth = head;
1106
1107 while (rth) {
1108 length += has_noalias(head, rth);
Eric Dumazet1c317202010-10-25 21:02:07 +00001109 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
Eric Dumazet98376382010-03-08 03:20:00 +00001110 }
1111 return length >> FRACT_BITS;
1112}
1113
David S. Millerd3aaeb32011-07-18 00:40:17 -07001114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
David Miller3769cff2011-07-11 22:44:24 +00001115{
David S. Millerd3aaeb32011-07-18 00:40:17 -07001116 static const __be32 inaddr_any = 0;
1117 struct net_device *dev = dst->dev;
1118 const __be32 *pkey = daddr;
David Miller3769cff2011-07-11 22:44:24 +00001119 struct neighbour *n;
1120
David Miller3769cff2011-07-11 22:44:24 +00001121 if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
David S. Millerd3aaeb32011-07-18 00:40:17 -07001122 pkey = &inaddr_any;
1123
David Miller32092ec2011-07-25 00:01:41 +00001124 n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001125 if (n)
1126 return n;
David Miller32092ec2011-07-25 00:01:41 +00001127 return neigh_create(&arp_tbl, pkey, dev);
David S. Millerd3aaeb32011-07-18 00:40:17 -07001128}
1129
/* Resolve and attach an ARP neighbour for @rt's gateway address.
 * Returns 0 on success or a negative errno from the neighbour layer
 * (e.g. -ENOBUFS when the neighbour table is full).
 */
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
1139
/* Insert @rt into hash bucket @hash, or return an equivalent entry that
 * is already cached.  On success returns the entry to use (which may be
 * an existing one, in which case @rt is dropped) and, if @skb is given,
 * attaches it as the skb's dst.  Returns ERR_PTR on neighbour binding
 * failure.  May rehash and restart if the chain triggers an emergency
 * rebuild, and may run GC and retry once when the neighbour table is
 * full (process context only: attempts = !in_softirq()).
 */
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller hold the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Reap stale-generation entries as we walk. */
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Existing entry wins; the new one is discarded. */
			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		/* Track the lowest-scored unreferenced entry as an
		 * eviction candidate in case the chain is too long. */
		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* Genid changed: recompute the bucket and retry. */
			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
1319
/* Global generation counter for inet_peer derived state; entries whose
 * rt_peer_genid lags this value must revalidate peer-held data. */
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}
1326
/* Attach the inet_peer entry for @daddr to @rt (rt->peer).
 * @create: non-zero to allocate the peer if it does not yet exist.
 * Race-safe: cmpxchg() ensures only one binder wins; a loser drops its
 * extra reference.  The winner (and the lookup-failed case) refreshes
 * rt_peer_genid so peer-derived state is treated as current.
 */
void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);	/* lost the race: drop our ref */
	else
		rt->rt_peer_genid = rt_peer_genid();
}
1338
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	/* Fallback IP ID: mix the previous id with the destination via
	 * secure_ip_id(), serialized by a private spinlock since the
	 * static state is shared by all callers. */
	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
1358
/* Choose the IP header identification field for a packet going out via
 * @dst.  Prefers the per-destination counter kept on the route's
 * inet_peer (binding one on demand); falls back to the global
 * ip_select_fb_ident() when no peer is available or the dst opted out
 * with DST_NOPEER.  A NULL rt indicates a caller bug and is logged.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else if (!rt)
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381
/* Remove @rt from hash bucket @hash and drop the caller's reference.
 * While holding the bucket lock, any entry whose generation id has
 * expired is opportunistically reaped as well.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
1401
/* Apply a redirect recorded on the inet_peer: switch @dst's gateway to
 * peer->redirect_learned.a4 and rebind the neighbour entry.  If the new
 * neighbour cannot be obtained, the original gateway is restored and
 * the route is left untouched.
 */
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	/* Swap in the new neighbour; release the one we displaced. */
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		/* New gateway not resolved yet: kick off resolution. */
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
1427
/* called in rcu_read_lock() section */
/* Handle an incoming ICMP redirect: old_gw told us to reach @daddr via
 * @new_gw.  After sanity/policy checks (per-device sysctls, RFC 1122
 * style validation of the advised gateway) the learned gateway is
 * recorded on the destination's inet_peer and applied to every cached
 * output route that currently points at @old_gw.  Invalid redirects are
 * logged (rate-limited) under CONFIG_IP_ROUTE_VERBOSE.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	/* probe both the exact source key and the wildcard (0) key */
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		/* new gateway must be directly reachable on this link */
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			/* read-side walk only; updates go through the peer */
			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw) {
						peer->redirect_learned.a4 = new_gw;
						/* force revalidation elsewhere */
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
1508
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001509static bool peer_pmtu_expired(struct inet_peer *peer)
1510{
1511 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1512
1513 return orig &&
1514 time_after_eq(jiffies, orig) &&
1515 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1516}
1517
1518static bool peer_pmtu_cleaned(struct inet_peer *peer)
1519{
1520 unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1521
1522 return orig &&
1523 cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1524}
1525
/* dst_ops->negative_advice hook: an upper layer (e.g. TCP) reports
 * trouble with this route.  Obsolete entries are released, entries
 * installed by an ICMP redirect are deleted from the cache, and an
 * expired learned PMTU is rolled back to the original metric.
 * Returns the dst to keep using, or NULL to force a fresh lookup.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			/* recompute the bucket this entry lives in */
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
1547
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
1563
/* Send an ICMP redirect for @skb back to its sender, subject to the
 * per-peer exponential backoff described in the algorithm note above.
 * Without inet_peer state the redirect is sent unconditionally.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	/* sample the sysctl before dropping the RCU read lock */
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.  Backoff doubles with each redirect already sent.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
1622
/* Input handler for routes carrying a dst.error (no route, filtered,
 * host/net unreachable).  Sends a per-peer rate-limited ICMP
 * destination-unreachable matching the error class, then frees the
 * skb.  Always returns 0 (packet consumed).
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		/* no ICMP for these: just drop */
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		/* token bucket: tokens accrue with elapsed jiffies,
		 * capped at ip_rt_error_burst; each ICMP costs
		 * ip_rt_error_cost tokens.
		 */
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001670
1671/*
1672 * The last two values are not from the RFC but
1673 * are needed for AMPRnet AX.25 paths.
1674 */
1675
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001676static const unsigned short mtu_plateau[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -07001677{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1678
Stephen Hemminger5969f712008-04-10 01:52:09 -07001679static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001680{
1681 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001682
Linus Torvalds1da177e2005-04-16 15:20:36 -07001683 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1684 if (old_mtu > mtu_plateau[i])
1685 return mtu_plateau[i];
1686 return 68;
1687}
1688
/* Process an ICMP fragmentation-needed report for the flow in @iph.
 * The learned path MTU (guessed from the plateau table when the ICMP
 * carried none, clamped to ip_rt_min_pmtu) is stored on the
 * destination's inet_peer with an ip_rt_mtu_expires lifetime.
 * Returns the MTU to use, or @new_mtu when nothing was learned.
 */
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);	/* create = 1: takes a ref */
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;	/* 0 means "unset" */

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);	/* release inet_getpeer_v4() ref */
	}
	return est_mtu ? : new_mtu;
}
1731
/* Sync the route's MTU metric with the peer's learned-PMTU state.
 * While the learned PMTU is still valid, install it (saving the
 * original metric once in pmtu_orig); once expired, the single caller
 * that wins the cmpxchg restores the original MTU.
 */
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
1748
/* dst_ops->update_pmtu hook: record a smaller path MTU (@mtu, clamped
 * to ip_rt_min_pmtu) on the route's inet_peer, bump the peer genid so
 * other cached routes revalidate, and apply the new value to this dst.
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		/* only tighten: adopt @mtu when nothing is pending or it
		 * is smaller than what was learned before.
		 */
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;	/* 0 means "unset" */

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
1779
David S. Millerf39925d2011-02-09 22:00:16 -08001780
/* Lazily resync this route with its inet_peer when the global peer
 * generation has advanced: re-check the learned PMTU and any learned
 * redirect gateway, then stamp the route with the current genid so the
 * work is not repeated until the next peer update.
 */
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);	/* create = 0 */

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
1801
1802static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803{
1804 struct rtable *rt = (struct rtable *) dst;
1805
1806 if (rt_is_expired(rt))
1807 return NULL;
David S. Millerde398fb2011-12-05 13:21:42 -05001808 ipv4_validate_peer(rt);
Timo Teräsd11a4dc2010-03-18 23:20:20 +00001809 return dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810}
1811
1812static void ipv4_dst_destroy(struct dst_entry *dst)
1813{
1814 struct rtable *rt = (struct rtable *) dst;
1815 struct inet_peer *peer = rt->peer;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816
David S. Miller62fa8a82011-01-26 20:51:05 -08001817 if (rt->fi) {
1818 fib_info_put(rt->fi);
1819 rt->fi = NULL;
1820 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821 if (peer) {
1822 rt->peer = NULL;
1823 inet_putpeer(peer);
1824 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825}
1826
Linus Torvalds1da177e2005-04-16 15:20:36 -07001827
1828static void ipv4_link_failure(struct sk_buff *skb)
1829{
1830 struct rtable *rt;
1831
1832 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833
Eric Dumazet511c3f92009-06-02 05:14:27 +00001834 rt = skb_rtable(skb);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00001835 if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1836 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001837}
1838
1839static int ip_rt_bug(struct sk_buff *skb)
1840{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001841 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1842 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001843 skb->dev ? skb->dev->name : "?");
1844 kfree_skb(skb);
Dave Jonesc378a9c2011-05-21 07:16:42 +00001845 WARN_ON(1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846 return 0;
1847}
1848
/*
   We do not cache source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it out of fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

/* Write the 4-byte source address to use for IP option processing into
 * @addr (possibly unaligned).  Output routes use the packet's own
 * source; input routes redo the FIB lookup for the reverse direction
 * and fall back to an interface address of universe scope.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		/* rebuild a flow key from the received header */
		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);	/* bytewise: @addr may be unaligned */
}
1889
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill in whichever 16-bit half of dst.tclassid is still zero from the
 * corresponding half of @tag; halves already set are left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 tclassid = rt->dst.tclassid;

	if (!(tclassid & 0xFFFF))
		tclassid |= tag & 0xFFFF;
	if (!(tclassid & 0xFFFF0000))
		tclassid |= tag & 0xFFFF0000;

	rt->dst.tclassid = tclassid;
}
#endif
1899
David S. Miller0dbaee32010-12-13 12:52:14 -08001900static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901{
1902 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1903
1904 if (advmss == 0) {
1905 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906 ip_rt_min_advmss);
1907 if (advmss > 65535 - 40)
1908 advmss = 65535 - 40;
1909 }
1910 return advmss;
1911}
1912
Steffen Klassertebb762f2011-11-23 02:12:51 +00001913static unsigned int ipv4_mtu(const struct dst_entry *dst)
David S. Millerd33e4552010-12-14 13:01:14 -08001914{
Steffen Klassert261663b2011-11-23 02:14:50 +00001915 const struct rtable *rt = (const struct rtable *) dst;
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001916 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1917
Steffen Klassert261663b2011-11-23 02:14:50 +00001918 if (mtu && rt_is_output_route(rt))
Steffen Klassert618f9bc2011-11-23 02:13:31 +00001919 return mtu;
1920
1921 mtu = dst->dev->mtu;
David S. Millerd33e4552010-12-14 13:01:14 -08001922
1923 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
David S. Millerd33e4552010-12-14 13:01:14 -08001924
1925 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926 mtu = 576;
1927 }
1928
1929 if (mtu > IP_MAX_MTU)
1930 mtu = IP_MAX_MTU;
1931
1932 return mtu;
1933}
1934
/* Attach metrics to a freshly built route: prefer the inet_peer's
 * copy-on-write metric block (creating the peer when the flow asks for
 * pre-COWed metrics), otherwise reference the fib_info's metrics.
 * Peer state (learned PMTU, learned redirect gateway) is adopted here.
 */
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		/* first user of the peer seeds its metrics from the fib */
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);

		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		/* non-default fib metrics must be refcounted via rt->fi */
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
1970
/* Finish route construction from a FIB lookup result: record the
 * next-hop gateway for link-scope nexthops, hook up metrics, clamp
 * MTU/ADVMSS metrics to protocol limits, and apply classifier tags.
 */
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	/* sanitize metrics possibly inherited from peer/fib */
	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
1999
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002000static struct rtable *rt_dst_alloc(struct net_device *dev,
2001 bool nopolicy, bool noxfrm)
David S. Miller0c4dcd52011-02-17 15:42:37 -08002002{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002003 return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004 DST_HOST |
2005 (nopolicy ? DST_NOPOLICY : 0) |
2006 (noxfrm ? DST_NOXFRM : 0));
David S. Miller0c4dcd52011-02-17 15:42:37 -08002007}
2008
/* called in rcu_read_lock() section */
/* Build and cache an input route for a multicast packet arriving on
 * @dev.  Validates the source address, allocates the rtable, delivers
 * locally (and/or to the multicast forwarder) as appropriate, and
 * interns it in the route hash.  Returns 0 or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	/* multicast/broadcast/loopback sources are never valid here */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sender is only allowed for link-local groups */
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* multicast routes must never be used for output */
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		/* this host is a member of the group: deliver locally */
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
2088
2089
2090static void ip_handle_martian_source(struct net_device *dev,
2091 struct in_device *in_dev,
2092 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07002093 __be32 daddr,
2094 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002095{
2096 RT_CACHE_STAT_INC(in_martian_src);
2097#ifdef CONFIG_IP_ROUTE_VERBOSE
2098 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099 /*
2100 * RFC1812 recommendation, if source is martian,
2101 * the only hint is MAC header.
2102 */
Harvey Harrison673d57e2008-10-31 00:53:57 -07002103 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2104 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002105 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002106 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07002107 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002108 printk(KERN_WARNING "ll header: ");
2109 for (i = 0; i < dev->hard_header_len; i++, p++) {
2110 printk("%02x", *p);
2111 if (i < (dev->hard_header_len - 1))
2112 printk(":");
2113 }
2114 printk("\n");
2115 }
2116 }
2117#endif
2118}
2119
Eric Dumazet47360222010-06-03 04:13:21 +00002120/* called in rcu_read_lock() section */
Stephen Hemminger5969f712008-04-10 01:52:09 -07002121static int __mkroute_input(struct sk_buff *skb,
David S. Miller982721f2011-02-16 21:44:24 -08002122 const struct fib_result *res,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002123 struct in_device *in_dev,
2124 __be32 daddr, __be32 saddr, u32 tos,
2125 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002127 struct rtable *rth;
2128 int err;
2129 struct in_device *out_dev;
Eric Dumazet47360222010-06-03 04:13:21 +00002130 unsigned int flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07002131 __be32 spec_dst;
2132 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133
2134 /* get a working reference to the output device */
Eric Dumazet47360222010-06-03 04:13:21 +00002135 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002136 if (out_dev == NULL) {
2137 if (net_ratelimit())
2138 printk(KERN_CRIT "Bug in ip_route_input" \
2139 "_slow(). Please, report\n");
2140 return -EINVAL;
2141 }
2142
2143
Michael Smith5c04c812011-04-07 04:51:50 +00002144 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2145 in_dev->dev, &spec_dst, &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002147 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002148 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002149
Linus Torvalds1da177e2005-04-16 15:20:36 -07002150 goto cleanup;
2151 }
2152
2153 if (err)
2154 flags |= RTCF_DIRECTSRC;
2155
Thomas Graf51b77ca2008-06-03 16:36:01 -07002156 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002157 (IN_DEV_SHARED_MEDIA(out_dev) ||
2158 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2159 flags |= RTCF_DOREDIRECT;
2160
2161 if (skb->protocol != htons(ETH_P_IP)) {
2162 /* Not IP (i.e. ARP). Do not create route, if it is
2163 * invalid for proxy arp. DNAT routes are always valid.
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002164 *
2165 * Proxy arp feature have been extended to allow, ARP
2166 * replies back to the same interface, to support
2167 * Private VLAN switch technologies. See arp.c.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002168 */
Jesper Dangaard Brouer65324142010-01-05 05:50:47 +00002169 if (out_dev == in_dev &&
2170 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171 err = -EINVAL;
2172 goto cleanup;
2173 }
2174 }
2175
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002176 rth = rt_dst_alloc(out_dev->dev,
2177 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002178 IN_DEV_CONF_GET(out_dev, NOXFRM));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179 if (!rth) {
2180 err = -ENOBUFS;
2181 goto cleanup;
2182 }
2183
David S. Miller5e2b61f2011-03-04 21:47:09 -08002184 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002185 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002186 rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2187 rth->rt_flags = flags;
2188 rth->rt_type = res->type;
David S. Miller475949d2011-05-03 19:45:15 -07002189 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002190 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002191 rth->rt_src = saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002192 rth->rt_route_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002193 rth->rt_iif = in_dev->dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002194 rth->rt_oif = 0;
David S. Millercf911662011-04-28 14:31:47 -07002195 rth->rt_mark = skb->mark;
2196 rth->rt_gateway = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002198 rth->rt_peer_genid = 0;
2199 rth->peer = NULL;
2200 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002201
Changli Gaod8d1f302010-06-10 23:31:35 -07002202 rth->dst.input = ip_forward;
2203 rth->dst.output = ip_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002204
David S. Miller5e2b61f2011-03-04 21:47:09 -08002205 rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002206
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207 *result = rth;
2208 err = 0;
2209 cleanup:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002211}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212
Stephen Hemminger5969f712008-04-10 01:52:09 -07002213static int ip_mkroute_input(struct sk_buff *skb,
2214 struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002215 const struct flowi4 *fl4,
Stephen Hemminger5969f712008-04-10 01:52:09 -07002216 struct in_device *in_dev,
2217 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218{
Chuck Short7abaa272005-06-22 22:10:23 -07002219 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002220 int err;
2221 unsigned hash;
2222
2223#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Millerff3fccb2011-03-10 16:23:24 -08002224 if (res->fi && res->fi->fib_nhs > 1)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002225 fib_select_multipath(res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226#endif
2227
2228 /* create a routing cache entry */
2229 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2230 if (err)
2231 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002232
2233 /* put it into the cache */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002234 hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
Changli Gaod8d1f302010-06-10 23:31:35 -07002235 rt_genid(dev_net(rth->dst.dev)));
David S. Miller68a5e3d2011-03-11 20:07:33 -05002236 rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002237 if (IS_ERR(rth))
2238 return PTR_ERR(rth);
2239 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002240}
2241
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242/*
2243 * NOTE. We drop all the packets that has local source
2244 * addresses, because every properly looped back packet
2245 * must have correct destination already attached by output routine.
2246 *
2247 * Such approach solves two big problems:
2248 * 1. Not simplex devices are handled properly.
2249 * 2. IP spoofing attempts are filtered with 100% of guarantee.
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002250 * called with rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07002251 */
2252
Al Viro9e12bb22006-09-26 21:25:20 -07002253static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002254 u8 tos, struct net_device *dev)
2255{
2256 struct fib_result res;
Eric Dumazet96d36222010-06-02 19:21:31 +00002257 struct in_device *in_dev = __in_dev_get_rcu(dev);
David S. Miller68a5e3d2011-03-11 20:07:33 -05002258 struct flowi4 fl4;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002259 unsigned flags = 0;
2260 u32 itag = 0;
2261 struct rtable * rth;
2262 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002263 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264 int err = -EINVAL;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002265 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002266
2267 /* IP on this device is disabled. */
2268
2269 if (!in_dev)
2270 goto out;
2271
2272 /* Check for the most weird martians, which can be not detected
2273 by fib_lookup.
2274 */
2275
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002276 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002277 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002278 goto martian_source;
2279
Andy Walls27a954b2010-10-17 15:11:22 +00002280 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281 goto brd_input;
2282
2283 /* Accept zero addresses only to limited broadcast;
2284 * I even do not know to fix it or not. Waiting for complains :-)
2285 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002286 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002287 goto martian_source;
2288
Andy Walls27a954b2010-10-17 15:11:22 +00002289 if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002290 goto martian_destination;
2291
2292 /*
2293 * Now we are ready to route packet.
2294 */
David S. Miller68a5e3d2011-03-11 20:07:33 -05002295 fl4.flowi4_oif = 0;
2296 fl4.flowi4_iif = dev->ifindex;
2297 fl4.flowi4_mark = skb->mark;
2298 fl4.flowi4_tos = tos;
2299 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2300 fl4.daddr = daddr;
2301 fl4.saddr = saddr;
2302 err = fib_lookup(net, &fl4, &res);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002303 if (err != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002304 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002305 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002306 goto no_route;
2307 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002308
2309 RT_CACHE_STAT_INC(in_slow_tot);
2310
2311 if (res.type == RTN_BROADCAST)
2312 goto brd_input;
2313
2314 if (res.type == RTN_LOCAL) {
Michael Smith5c04c812011-04-07 04:51:50 +00002315 err = fib_validate_source(skb, saddr, daddr, tos,
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002316 net->loopback_dev->ifindex,
Michael Smith5c04c812011-04-07 04:51:50 +00002317 dev, &spec_dst, &itag);
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002318 if (err < 0)
2319 goto martian_source_keep_err;
2320 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002321 flags |= RTCF_DIRECTSRC;
2322 spec_dst = daddr;
2323 goto local_input;
2324 }
2325
2326 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002327 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002328 if (res.type != RTN_UNICAST)
2329 goto martian_destination;
2330
David S. Miller68a5e3d2011-03-11 20:07:33 -05002331 err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002332out: return err;
2333
2334brd_input:
2335 if (skb->protocol != htons(ETH_P_IP))
2336 goto e_inval;
2337
Joe Perchesf97c1e02007-12-16 13:45:43 -08002338 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002339 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2340 else {
Michael Smith5c04c812011-04-07 04:51:50 +00002341 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2342 &itag);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002343 if (err < 0)
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002344 goto martian_source_keep_err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002345 if (err)
2346 flags |= RTCF_DIRECTSRC;
2347 }
2348 flags |= RTCF_BROADCAST;
2349 res.type = RTN_BROADCAST;
2350 RT_CACHE_STAT_INC(in_brd);
2351
2352local_input:
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002353 rth = rt_dst_alloc(net->loopback_dev,
2354 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002355 if (!rth)
2356 goto e_nobufs;
2357
David S. Millercf911662011-04-28 14:31:47 -07002358 rth->dst.input= ip_local_deliver;
Changli Gaod8d1f302010-06-10 23:31:35 -07002359 rth->dst.output= ip_rt_bug;
David S. Millercf911662011-04-28 14:31:47 -07002360#ifdef CONFIG_IP_ROUTE_CLASSID
2361 rth->dst.tclassid = itag;
2362#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002363
David S. Miller5e2b61f2011-03-04 21:47:09 -08002364 rth->rt_key_dst = daddr;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002365 rth->rt_key_src = saddr;
David S. Millercf911662011-04-28 14:31:47 -07002366 rth->rt_genid = rt_genid(net);
2367 rth->rt_flags = flags|RTCF_LOCAL;
2368 rth->rt_type = res.type;
David S. Miller475949d2011-05-03 19:45:15 -07002369 rth->rt_key_tos = tos;
David S. Millercf911662011-04-28 14:31:47 -07002370 rth->rt_dst = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 rth->rt_src = saddr;
Patrick McHardyc7066f72011-01-14 13:36:42 +01002372#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07002373 rth->dst.tclassid = itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002374#endif
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002375 rth->rt_route_iif = dev->ifindex;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002376 rth->rt_iif = dev->ifindex;
David S. Millercf911662011-04-28 14:31:47 -07002377 rth->rt_oif = 0;
2378 rth->rt_mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002379 rth->rt_gateway = daddr;
2380 rth->rt_spec_dst= spec_dst;
David S. Millercf911662011-04-28 14:31:47 -07002381 rth->rt_peer_genid = 0;
2382 rth->peer = NULL;
2383 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384 if (res.type == RTN_UNREACHABLE) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002385 rth->dst.input= ip_error;
2386 rth->dst.error= -err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002387 rth->rt_flags &= ~RTCF_LOCAL;
2388 }
David S. Miller68a5e3d2011-03-11 20:07:33 -05002389 hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2390 rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002391 err = 0;
2392 if (IS_ERR(rth))
2393 err = PTR_ERR(rth);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002394 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002395
2396no_route:
2397 RT_CACHE_STAT_INC(in_no_route);
2398 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2399 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002400 if (err == -ESRCH)
2401 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002402 goto local_input;
2403
2404 /*
2405 * Do not cache martian addresses: they should be logged (RFC1812)
2406 */
2407martian_destination:
2408 RT_CACHE_STAT_INC(in_martian_dst);
2409#ifdef CONFIG_IP_ROUTE_VERBOSE
2410 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07002411 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2412 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002413#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002414
2415e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002416 err = -EHOSTUNREACH;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002417 goto out;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002418
Linus Torvalds1da177e2005-04-16 15:20:36 -07002419e_inval:
2420 err = -EINVAL;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002421 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002422
2423e_nobufs:
2424 err = -ENOBUFS;
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002425 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426
2427martian_source:
Eric Dumazetb5f7e752010-06-02 12:05:27 +00002428 err = -EINVAL;
2429martian_source_keep_err:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002430 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002431 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002432}
2433
Eric Dumazet407eadd2010-05-10 11:32:55 +00002434int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2435 u8 tos, struct net_device *dev, bool noref)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436{
2437 struct rtable * rth;
2438 unsigned hash;
2439 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002440 struct net *net;
Eric Dumazet96d36222010-06-02 19:21:31 +00002441 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002442
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002443 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002444
Eric Dumazet96d36222010-06-02 19:21:31 +00002445 rcu_read_lock();
2446
Neil Horman1080d702008-10-27 12:28:25 -07002447 if (!rt_caching(net))
2448 goto skip_cache;
2449
Linus Torvalds1da177e2005-04-16 15:20:36 -07002450 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002451 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002452
Linus Torvalds1da177e2005-04-16 15:20:36 -07002453 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002454 rth = rcu_dereference(rth->dst.rt_next)) {
David S. Miller5e2b61f2011-03-04 21:47:09 -08002455 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2456 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
Julian Anastasov97a80412011-08-09 04:01:16 +00002457 (rth->rt_route_iif ^ iif) |
David S. Miller475949d2011-05-03 19:45:15 -07002458 (rth->rt_key_tos ^ tos)) == 0 &&
David S. Miller5e2b61f2011-03-04 21:47:09 -08002459 rth->rt_mark == skb->mark &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002460 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002461 !rt_is_expired(rth)) {
David S. Millerde398fb2011-12-05 13:21:42 -05002462 ipv4_validate_peer(rth);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002463 if (noref) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002464 dst_use_noref(&rth->dst, jiffies);
2465 skb_dst_set_noref(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002466 } else {
Changli Gaod8d1f302010-06-10 23:31:35 -07002467 dst_use(&rth->dst, jiffies);
2468 skb_dst_set(skb, &rth->dst);
Eric Dumazet407eadd2010-05-10 11:32:55 +00002469 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002470 RT_CACHE_STAT_INC(in_hit);
2471 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002472 return 0;
2473 }
2474 RT_CACHE_STAT_INC(in_hlist_search);
2475 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002476
Neil Horman1080d702008-10-27 12:28:25 -07002477skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002478 /* Multicast recognition logic is moved from route cache to here.
2479 The problem was that too many Ethernet cards have broken/missing
2480 hardware multicast filters :-( As result the host on multicasting
2481 network acquires a lot of useless route cache entries, sort of
2482 SDR messages from all the world. Now we try to get rid of them.
2483 Really, provided software IP multicast filter is organized
2484 reasonably (at least, hashed), it does not result in a slowdown
2485 comparing with route cache reject entries.
2486 Note, that multicast routers are not affected, because
2487 route cache entry is created eventually.
2488 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002489 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002490 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491
Eric Dumazet96d36222010-06-02 19:21:31 +00002492 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002493 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2494 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002495 if (our
2496#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002497 ||
2498 (!ipv4_is_local_multicast(daddr) &&
2499 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002500#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002501 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002502 int res = ip_route_input_mc(skb, daddr, saddr,
2503 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002505 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506 }
2507 }
2508 rcu_read_unlock();
2509 return -EINVAL;
2510 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002511 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2512 rcu_read_unlock();
2513 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002514}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002515EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002517/* called with rcu_read_lock() */
David S. Miller982721f2011-02-16 21:44:24 -08002518static struct rtable *__mkroute_output(const struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002519 const struct flowi4 *fl4,
David S. Miller813b3b52011-04-28 14:48:42 -07002520 __be32 orig_daddr, __be32 orig_saddr,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002521 int orig_oif, __u8 orig_rtos,
2522 struct net_device *dev_out,
David S. Miller5ada5522011-02-17 15:29:00 -08002523 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002524{
David S. Miller982721f2011-02-16 21:44:24 -08002525 struct fib_info *fi = res->fi;
David S. Miller5ada5522011-02-17 15:29:00 -08002526 struct in_device *in_dev;
David S. Miller982721f2011-02-16 21:44:24 -08002527 u16 type = res->type;
David S. Miller5ada5522011-02-17 15:29:00 -08002528 struct rtable *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002529
David S. Miller68a5e3d2011-03-11 20:07:33 -05002530 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
David S. Miller5ada5522011-02-17 15:29:00 -08002531 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002532
David S. Miller68a5e3d2011-03-11 20:07:33 -05002533 if (ipv4_is_lbcast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002534 type = RTN_BROADCAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002535 else if (ipv4_is_multicast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002536 type = RTN_MULTICAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002537 else if (ipv4_is_zeronet(fl4->daddr))
David S. Miller5ada5522011-02-17 15:29:00 -08002538 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002539
2540 if (dev_out->flags & IFF_LOOPBACK)
2541 flags |= RTCF_LOCAL;
2542
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002543 in_dev = __in_dev_get_rcu(dev_out);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002544 if (!in_dev)
David S. Miller5ada5522011-02-17 15:29:00 -08002545 return ERR_PTR(-EINVAL);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002546
David S. Miller982721f2011-02-16 21:44:24 -08002547 if (type == RTN_BROADCAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002548 flags |= RTCF_BROADCAST | RTCF_LOCAL;
David S. Miller982721f2011-02-16 21:44:24 -08002549 fi = NULL;
2550 } else if (type == RTN_MULTICAST) {
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002551 flags |= RTCF_MULTICAST | RTCF_LOCAL;
David S. Miller813b3b52011-04-28 14:48:42 -07002552 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2553 fl4->flowi4_proto))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002554 flags &= ~RTCF_LOCAL;
2555 /* If multicast route do not exist use
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002556 * default one, but do not gateway in this case.
2557 * Yes, it is hack.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002558 */
David S. Miller982721f2011-02-16 21:44:24 -08002559 if (fi && res->prefixlen < 4)
2560 fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002561 }
2562
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002563 rth = rt_dst_alloc(dev_out,
2564 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002565 IN_DEV_CONF_GET(in_dev, NOXFRM));
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002566 if (!rth)
David S. Miller5ada5522011-02-17 15:29:00 -08002567 return ERR_PTR(-ENOBUFS);
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002568
David S. Millercf911662011-04-28 14:31:47 -07002569 rth->dst.output = ip_output;
2570
David S. Miller813b3b52011-04-28 14:48:42 -07002571 rth->rt_key_dst = orig_daddr;
2572 rth->rt_key_src = orig_saddr;
David S. Millercf911662011-04-28 14:31:47 -07002573 rth->rt_genid = rt_genid(dev_net(dev_out));
2574 rth->rt_flags = flags;
2575 rth->rt_type = type;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002576 rth->rt_key_tos = orig_rtos;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002577 rth->rt_dst = fl4->daddr;
2578 rth->rt_src = fl4->saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002579 rth->rt_route_iif = 0;
David S. Miller813b3b52011-04-28 14:48:42 -07002580 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2581 rth->rt_oif = orig_oif;
2582 rth->rt_mark = fl4->flowi4_mark;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002583 rth->rt_gateway = fl4->daddr;
2584 rth->rt_spec_dst= fl4->saddr;
David S. Millercf911662011-04-28 14:31:47 -07002585 rth->rt_peer_genid = 0;
2586 rth->peer = NULL;
2587 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002588
2589 RT_CACHE_STAT_INC(out_slow_tot);
2590
2591 if (flags & RTCF_LOCAL) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002592 rth->dst.input = ip_local_deliver;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002593 rth->rt_spec_dst = fl4->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002594 }
2595 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
David S. Miller68a5e3d2011-03-11 20:07:33 -05002596 rth->rt_spec_dst = fl4->saddr;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002597 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002598 !(dev_out->flags & IFF_LOOPBACK)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002599 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002600 RT_CACHE_STAT_INC(out_slow_mc);
2601 }
2602#ifdef CONFIG_IP_MROUTE
David S. Miller982721f2011-02-16 21:44:24 -08002603 if (type == RTN_MULTICAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002604 if (IN_DEV_MFORWARD(in_dev) &&
David S. Miller813b3b52011-04-28 14:48:42 -07002605 !ipv4_is_local_multicast(fl4->daddr)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002606 rth->dst.input = ip_mr_input;
2607 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002608 }
2609 }
2610#endif
2611 }
2612
David S. Miller813b3b52011-04-28 14:48:42 -07002613 rt_set_nexthop(rth, fl4, res, fi, type, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002614
David S. Miller5ada5522011-02-17 15:29:00 -08002615 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002616}
2617
Linus Torvalds1da177e2005-04-16 15:20:36 -07002618/*
2619 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002620 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621 */
2622
David S. Miller813b3b52011-04-28 14:48:42 -07002623static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002624{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002625 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002626 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002627 unsigned int flags = 0;
2628 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002629 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002630 __be32 orig_daddr;
2631 __be32 orig_saddr;
2632 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633
2634 res.fi = NULL;
2635#ifdef CONFIG_IP_MULTIPLE_TABLES
2636 res.r = NULL;
2637#endif
2638
David S. Miller813b3b52011-04-28 14:48:42 -07002639 orig_daddr = fl4->daddr;
2640 orig_saddr = fl4->saddr;
2641 orig_oif = fl4->flowi4_oif;
2642
2643 fl4->flowi4_iif = net->loopback_dev->ifindex;
2644 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2645 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2646 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002647
David S. Miller010c2702011-02-17 15:37:09 -08002648 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002649 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002650 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002651 if (ipv4_is_multicast(fl4->saddr) ||
2652 ipv4_is_lbcast(fl4->saddr) ||
2653 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002654 goto out;
2655
Linus Torvalds1da177e2005-04-16 15:20:36 -07002656 /* I removed check for oif == dev_out->oif here.
2657 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002658 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2659 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660 2. Moreover, we are allowed to send packets with saddr
2661 of another iface. --ANK
2662 */
2663
David S. Miller813b3b52011-04-28 14:48:42 -07002664 if (fl4->flowi4_oif == 0 &&
2665 (ipv4_is_multicast(fl4->daddr) ||
2666 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002667 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002668 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002669 if (dev_out == NULL)
2670 goto out;
2671
Linus Torvalds1da177e2005-04-16 15:20:36 -07002672 /* Special hack: user can direct multicasts
2673 and limited broadcast via necessary interface
2674 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2675 This hack is not just for fun, it allows
2676 vic,vat and friends to work.
2677 They bind socket to loopback, set ttl to zero
2678 and expect that it will work.
2679 From the viewpoint of routing cache they are broken,
2680 because we are not allowed to build multicast path
2681 with loopback source addr (look, routing cache
2682 cannot know, that ttl is zero, so that packet
2683 will not leave this host and route is valid).
2684 Luckily, this hack is good workaround.
2685 */
2686
David S. Miller813b3b52011-04-28 14:48:42 -07002687 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002688 goto make_route;
2689 }
Julian Anastasova210d012008-10-01 07:28:28 -07002690
David S. Miller813b3b52011-04-28 14:48:42 -07002691 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002692 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002693 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002694 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002695 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002696 }
2697
2698
David S. Miller813b3b52011-04-28 14:48:42 -07002699 if (fl4->flowi4_oif) {
2700 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002701 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002702 if (dev_out == NULL)
2703 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002704
2705 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002706 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002707 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002708 goto out;
2709 }
David S. Miller813b3b52011-04-28 14:48:42 -07002710 if (ipv4_is_local_multicast(fl4->daddr) ||
2711 ipv4_is_lbcast(fl4->daddr)) {
2712 if (!fl4->saddr)
2713 fl4->saddr = inet_select_addr(dev_out, 0,
2714 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002715 goto make_route;
2716 }
David S. Miller813b3b52011-04-28 14:48:42 -07002717 if (fl4->saddr) {
2718 if (ipv4_is_multicast(fl4->daddr))
2719 fl4->saddr = inet_select_addr(dev_out, 0,
2720 fl4->flowi4_scope);
2721 else if (!fl4->daddr)
2722 fl4->saddr = inet_select_addr(dev_out, 0,
2723 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002724 }
2725 }
2726
David S. Miller813b3b52011-04-28 14:48:42 -07002727 if (!fl4->daddr) {
2728 fl4->daddr = fl4->saddr;
2729 if (!fl4->daddr)
2730 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002731 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002732 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002733 res.type = RTN_LOCAL;
2734 flags |= RTCF_LOCAL;
2735 goto make_route;
2736 }
2737
David S. Miller813b3b52011-04-28 14:48:42 -07002738 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002739 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002740 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002741 /* Apparently, routing tables are wrong. Assume,
2742 that the destination is on link.
2743
2744 WHY? DW.
2745 Because we are allowed to send to iface
2746 even if it has NO routes and NO assigned
2747 addresses. When oif is specified, routing
2748 tables are looked up with only one purpose:
2749 to catch if destination is gatewayed, rather than
2750 direct. Moreover, if MSG_DONTROUTE is set,
2751 we send packet, ignoring both routing tables
2752 and ifaddr state. --ANK
2753
2754
2755 We could make it even if oif is unknown,
2756 likely IPv6, but we do not.
2757 */
2758
David S. Miller813b3b52011-04-28 14:48:42 -07002759 if (fl4->saddr == 0)
2760 fl4->saddr = inet_select_addr(dev_out, 0,
2761 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762 res.type = RTN_UNICAST;
2763 goto make_route;
2764 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002765 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002766 goto out;
2767 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002768
2769 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002770 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002771 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002772 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002773 else
David S. Miller813b3b52011-04-28 14:48:42 -07002774 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002775 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002776 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002777 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778 res.fi = NULL;
2779 flags |= RTCF_LOCAL;
2780 goto make_route;
2781 }
2782
2783#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002784 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002785 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002786 else
2787#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002788 if (!res.prefixlen &&
2789 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002790 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002791 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002792
David S. Miller813b3b52011-04-28 14:48:42 -07002793 if (!fl4->saddr)
2794 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002795
Linus Torvalds1da177e2005-04-16 15:20:36 -07002796 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002797 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798
2799
2800make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002801 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002802 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002803 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002804 unsigned int hash;
2805
David S. Miller813b3b52011-04-28 14:48:42 -07002806 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002807 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002808 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002809 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810
David S. Miller010c2702011-02-17 15:37:09 -08002811out:
2812 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002813 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814}
2815
/*
 * __ip_route_output_key - resolve an output route for the flow in @flp4.
 *
 * Fast path: when caching is enabled for @net, walk the matching hash
 * chain under rcu_read_lock_bh() looking for a cached output route whose
 * key (daddr, saddr, oif, mark, tos) equals the flow's, which belongs to
 * @net and has not expired.  On a hit, revalidate the entry's peer, bump
 * its usage timestamp, count the hit, and backfill any unset saddr/daddr
 * in @flp4 from the cached route before returning it.
 *
 * Slow path (caching disabled, or cache miss): ip_route_output_slow().
 *
 * Returns the route, or an ERR_PTR() propagated from the slow path.
 */
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    /* tos must agree in the routing-relevant bits only */
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
2856
/*
 * A blackhole dst never becomes valid again: always report it as stale
 * (NULL) so callers re-resolve rather than reuse it.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2861
Steffen Klassertebb762f2011-11-23 02:12:51 +00002862static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002863{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002864 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2865
2866 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002867}
2868
/* Deliberate no-op: blackhole routes ignore path-MTU updates. */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2872
/*
 * Refuse copy-on-write of metrics for blackhole routes; NULL tells the
 * caller that no writable metrics block can be provided.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2878
/*
 * dst_ops backing "blackhole" copies of IPv4 routes (ipv4_blackhole_route):
 * .check always invalidates the entry, and PMTU updates / metric COW are
 * no-ops, so the clone can never be revived or mutated.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2890
/*
 * ipv4_blackhole_route - clone @dst_orig into a "blackhole" route.
 *
 * Allocates a new rtable backed by ipv4_dst_blackhole_ops, copies the
 * original route's key, metrics, device and cached data, but points
 * input/output at dst_discard so every packet using the clone is silently
 * dropped.  The caller's reference on @dst_orig is always released.
 *
 * Returns the new dst, or ERR_PTR(-ENOMEM) if allocation failed.
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* Discard everything sent through this route. */
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Duplicate the lookup key of the original route. */
		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		/* Share peer and fib_info, taking an extra ref on each. */
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2937
David S. Miller9d6ec932011-03-12 01:12:47 -05002938struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002939 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002940{
David S. Miller9d6ec932011-03-12 01:12:47 -05002941 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002942
David S. Millerb23dd4f2011-03-02 14:31:35 -08002943 if (IS_ERR(rt))
2944 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002945
David S. Miller56157872011-05-02 14:37:45 -07002946 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002947 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2948 flowi4_to_flowi(flp4),
2949 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002950
David S. Millerb23dd4f2011-03-02 14:31:35 -08002951 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002952}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002953EXPORT_SYMBOL_GPL(ip_route_output_flow);
2954
/*
 * rt_fill_info - serialize one route-cache entry as an RTM netlink message.
 *
 * Emits an rtmsg header plus RTA_* attributes for skb_rtable(@skb):
 * destination, optional source/oif/classid/prefsrc/gateway/mark, metrics,
 * and cacheinfo (ip id, TCP timestamp data and PMTU expiry pulled from the
 * route's inet_peer, when present).
 *
 * For cloned input multicast routes (CONFIG_IP_MROUTE) resolution may be
 * delegated to ipmr_get_route(); with @nowait set, failures other than
 * -EMSGSIZE are folded into the cacheinfo error field instead of aborting.
 *
 * Returns the nlmsg_end() result on success, 0 when ipmr fully handled
 * the reply, or -EMSGSIZE when the message did not fit in @skb.
 */
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family = AF_INET;
	r->rtm_dst_len = 32;
	r->rtm_src_len = 0;
	r->rtm_tos = rt->rt_key_tos;
	r->rtm_table = RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type = rt->rt_type;
	r->rtm_scope = RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	/* Preferred source: spec_dst for input routes, else rt_src if it
	 * differs from the key's source address. */
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		/* Convert the absolute PMTU expiry into a remaining delta. */
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
3062
/*
 * inet_rtm_getroute - handle an RTM_GETROUTE netlink request.
 *
 * Builds a dummy skb and resolves the route the kernel would pick for the
 * daddr/saddr/tos/oif/mark supplied in the request:
 *  - with RTA_IIF present, input routing is simulated via ip_route_input()
 *    on the named device (under local_bh_disable());
 *  - otherwise a normal output lookup is done via ip_route_output_key().
 * The resulting route is serialized with rt_fill_info() and unicast back
 * to the requesting socket.  Returns 0 or a negative errno.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	/* All attributes are optional; missing ones default to zero. */
	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		/* A dst-level error (e.g. unreachable) becomes the result. */
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
3157
/*
 * ip_rt_dump - netlink dump callback for the route cache.
 *
 * Walks every hash bucket and emits one RTM_NEWROUTE message per live,
 * non-expired cache entry belonging to the caller's netns.  The resume
 * position for partial dumps is kept in cb->args[0] (bucket) and
 * cb->args[1] (index within the bucket's chain).  Returns skb->len so
 * the netlink core knows how much was written.
 */
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			/* Borrow the dst (no ref) just for rt_fill_info(). */
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
3199
/*
 * A device's multicast state changed: flush the route cache of that
 * device's netns immediately (delay 0).
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
3204
3205#ifdef CONFIG_SYSCTL
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003206static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003207 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003208 size_t *lenp, loff_t *ppos)
3209{
3210 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003211 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003212 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003213 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003214
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003215 memcpy(&ctl, __ctl, sizeof(ctl));
3216 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003217 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003218
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003219 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003220 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003221 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003222 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003223
3224 return -EINVAL;
3225}
3226
/*
 * Global tuning knobs exposed under /proc/sys/net/ipv4/route/.
 * All entries are plain ints; the *_interval/*_timeout/*_expires values
 * go through the jiffies-converting proc handlers.
 */
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		/* Same backing variable as gc_min_interval, in milliseconds. */
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003337
/* Placeholder ctl_table: anchors the "neigh" sysctl directory below;
 * its real entries are registered dynamically by the neighbour code. */
static struct ctl_table empty[1];
3339
/*
 * Static skeleton of the net.ipv4 sysctl tree, registered early at boot
 * by ip_static_sysctl_init(): "route" exposes ipv4_route_table, while
 * "neigh" is an empty directory populated later.
 */
static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};
3348
/* Path /proc/sys/net/ipv4 under which ipv4_skeleton is registered. */
static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003354
/*
 * Per-namespace write-only "flush" sysctl: any write triggers a routing
 * cache flush via ipv4_sysctl_rtcache_flush.  No .data is needed; the
 * owning struct net is stashed in extra1 at registration time.
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
3364
/* Path /proc/sys/net/ipv4/route for the per-namespace flush table. */
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
3371
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003372static __net_init int sysctl_route_net_init(struct net *net)
3373{
3374 struct ctl_table *tbl;
3375
3376 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003377 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003378 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3379 if (tbl == NULL)
3380 goto err_dup;
3381 }
3382 tbl[0].extra1 = net;
3383
3384 net->ipv4.route_hdr =
3385 register_net_sysctl_table(net, ipv4_route_path, tbl);
3386 if (net->ipv4.route_hdr == NULL)
3387 goto err_reg;
3388 return 0;
3389
3390err_reg:
3391 if (tbl != ipv4_route_flush_table)
3392 kfree(tbl);
3393err_dup:
3394 return -ENOMEM;
3395}
3396
3397static __net_exit void sysctl_route_net_exit(struct net *net)
3398{
3399 struct ctl_table *tbl;
3400
3401 tbl = net->ipv4.route_hdr->ctl_table_arg;
3402 unregister_net_sysctl_table(net->ipv4.route_hdr);
3403 BUG_ON(tbl == ipv4_route_flush_table);
3404 kfree(tbl);
3405}
3406
/* Tie the sysctl setup/teardown above to network-namespace lifetime. */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003411#endif
3412
Neil Horman3ee94372010-05-08 01:57:52 -07003413static __net_init int rt_genid_init(struct net *net)
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003414{
Neil Horman3ee94372010-05-08 01:57:52 -07003415 get_random_bytes(&net->ipv4.rt_genid,
3416 sizeof(net->ipv4.rt_genid));
David S. Miller436c3b62011-03-24 17:42:21 -07003417 get_random_bytes(&net->ipv4.dev_addr_genid,
3418 sizeof(net->ipv4.dev_addr_genid));
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003419 return 0;
3420}
3421
/* Per-namespace init of the generation counters; no exit needed. */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
3425
3426
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting buckets for route realms/classids
 * (256 entries per cpu, allocated in ip_rt_init()). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003430
3431static __initdata unsigned long rhash_entries;
3432static int __init set_rhash_entries(char *str)
3433{
3434 if (!str)
3435 return 0;
3436 rhash_entries = simple_strtoul(str, &str, 0);
3437 return 1;
3438}
3439__setup("rhash_entries=", set_rhash_entries);
3440
/*
 * Boot-time initialisation of the IPv4 routing layer: dst slab caches,
 * the route-cache hash table, periodic GC work, /proc files, xfrm hooks,
 * the RTM_GETROUTE handler and the per-namespace sysctl/genid subsystems.
 * Steps are order-dependent; allocation failures at this stage panic.
 */
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	/* one 256-bucket accounting array per cpu */
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* blackhole dsts share the same slab cache */
	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	/*
	 * Size the route-cache hash from rhash_entries= if given, otherwise
	 * scale with available memory (shift 15 below 512MB, 17 above).
	 * rt_hash_log/rt_hash_mask are filled in by the allocator.
	 */
	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	/* derive GC trigger and hard cap from the hash size */
	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* kick off the periodic route-cache expiry worker */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	/* /proc files are nice to have but not fatal if they fail */
	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
3501
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/*
 * Register the static /proc/sys/net/ipv4 skeleton early in boot so the
 * "route" and "neigh" directories exist before other ipv4 init code
 * hangs entries under them.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif