/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/atmclip.h>
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

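/*
 * Copy-on-write handler for dst metrics: when a writable metrics array
 * is needed, bind the route to its inet_peer and publish the peer's
 * metrics with cmpxchg(), so racing writers resolve safely and the
 * shared read-only template is never modified in place.
 */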
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

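/*
 * Operations table that plugs the IPv4 routing cache into the generic
 * dst cache layer; dst_entry operations on IPv4 routes dispatch
 * through these callbacks.
 */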
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

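/*
 * Hash a (daddr, saddr, ifindex) tuple into a route cache bucket.  The
 * per-namespace generation id is mixed in, so bumping it both respreads
 * new lookups and lets stale entries be detected and reaped lazily.
 */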
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

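/*
 * An unreferenced entry may expire once it outlives its grace period:
 * tmo1 protects ordinary entries and the (longer) tmo2 protects
 * valuable ones; broadcast/multicast entries that collide in the hash
 * (rt_fast_clean) get no tmo1 grace at all.
 */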
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

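/*
 * Branch-free key comparison: XOR each pair of fields and OR the
 * results, so the keys match iff the accumulated value is zero.
 * compare_hash_inputs() covers only the fields that feed rt_hash();
 * compare_keys() below additionally checks mark, TOS and oif.
 */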
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This gives an estimate of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

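/*
 * Scan a slice of the hash table sized so that successive runs cover
 * the whole table roughly once per ip_rt_gc_timeout:
 * goal = (buckets * time since last run) / gc_timeout.  A static rover
 * remembers where the previous scan stopped.
 */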
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire stays large enough to keep enough warm entries,
   and when load increases it shrinks to limit cache size.
 */

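/*
 * "goal" below is the number of entries that must go: normally the
 * excess over (elasticity << hash_log), with "equilibrium" drifting
 * toward current usage when the cache is within bounds.  "expire" is
 * halved each time the goal is missed and grows back by
 * gc_min_interval after successful runs.
 */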
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halved.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		   We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

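/*
 * Resolve the neighbour entry for a route.  Loopback and point-to-point
 * devices need no per-destination resolution, so they share the
 * INADDR_ANY neighbour; ATM clip devices use their own table.
 */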
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

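/*
 * Insert a route into the cache bucket for @hash.  A key match moves
 * the existing entry to the chain head and reuses it; otherwise the
 * lowest-scoring unreferenced entry is remembered as an eviction
 * candidate for overlong chains, and pathological chain lengths
 * trigger an emergency rebuild of the whole cache.
 */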
David S. Millerb23dd4f2011-03-02 14:31:35 -08001147static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1148 struct sk_buff *skb, int ifindex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001149{
Eric Dumazet1c317202010-10-25 21:02:07 +00001150 struct rtable *rth, *cand;
1151 struct rtable __rcu **rthp, **candp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001152 unsigned long now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001153 u32 min_score;
1154 int chain_length;
1155 int attempts = !in_softirq();
1156
1157restart:
1158 chain_length = 0;
1159 min_score = ~(u32)0;
1160 cand = NULL;
1161 candp = NULL;
1162 now = jiffies;
1163
Changli Gaod8d1f302010-06-10 23:31:35 -07001164 if (!rt_caching(dev_net(rt->dst.dev))) {
Neil Horman73e42892009-06-20 01:15:16 -07001165 /*
1166 * If we're not caching, just tell the caller we
1167 * were successful and don't touch the route. The
1168 * caller hold the sole reference to the cache entry, and
1169 * it will be released when the caller is done with it.
1170 * If we drop it here, the callers have no way to resolve routes
1171 * when we're not caching. Instead, just point *rp at rt, so
1172 * the caller gets a single use out of the route
Neil Hormanb6280b42009-06-22 10:18:53 +00001173 * Note that we do rt_free on this new route entry, so that
1174 * once its refcount hits zero, we are still able to reap it
1175 * (Thanks Alexey)
Eric Dumazet27b75c92010-10-15 05:44:11 +00001176 * Note: To avoid expensive rcu stuff for this uncached dst,
1177 * we set DST_NOCACHE so that dst_release() can free dst without
1178 * waiting a grace period.
Neil Horman73e42892009-06-20 01:15:16 -07001179 */
Neil Hormanb6280b42009-06-22 10:18:53 +00001180
Eric Dumazetc7d44262010-10-03 22:17:54 -07001181 rt->dst.flags |= DST_NOCACHE;
David S. Millerc7537962010-11-11 17:07:48 -08001182 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
David Miller3769cff2011-07-11 22:44:24 +00001183 int err = rt_bind_neighbour(rt);
Neil Hormanb6280b42009-06-22 10:18:53 +00001184 if (err) {
1185 if (net_ratelimit())
1186 printk(KERN_WARNING
1187 "Neighbour table failure & not caching routes.\n");
Eric Dumazet27b75c92010-10-15 05:44:11 +00001188 ip_rt_put(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001189 return ERR_PTR(err);
Neil Hormanb6280b42009-06-22 10:18:53 +00001190 }
1191 }
1192
Neil Hormanb6280b42009-06-22 10:18:53 +00001193 goto skip_hashing;
Neil Horman1080d702008-10-27 12:28:25 -07001194 }
1195
Linus Torvalds1da177e2005-04-16 15:20:36 -07001196 rthp = &rt_hash_table[hash].chain;
1197
Eric Dumazet22c047c2005-07-05 14:55:24 -07001198 spin_lock_bh(rt_hash_lock_addr(hash));
Eric Dumazet1c317202010-10-25 21:02:07 +00001199 while ((rth = rcu_dereference_protected(*rthp,
1200 lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001201 if (rt_is_expired(rth)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07001202 *rthp = rth->dst.rt_next;
Eric Dumazet29e75252008-01-31 17:05:09 -08001203 rt_free(rth);
1204 continue;
1205 }
David S. Miller5e2b61f2011-03-04 21:47:09 -08001206 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001207 /* Put it first */
Changli Gaod8d1f302010-06-10 23:31:35 -07001208 *rthp = rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001209 /*
1210 * Since lookup is lockfree, the deletion
1211 * must be visible to another weakly ordered CPU before
1212 * the insertion at the start of the hash chain.
1213 */
Changli Gaod8d1f302010-06-10 23:31:35 -07001214 rcu_assign_pointer(rth->dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001215 rt_hash_table[hash].chain);
1216 /*
1217 * Since lookup is lockfree, the update writes
1218 * must be ordered for consistency on SMP.
1219 */
1220 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1221
Changli Gaod8d1f302010-06-10 23:31:35 -07001222 dst_use(&rth->dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -07001223 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001224
1225 rt_drop(rt);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001226 if (skb)
Changli Gaod8d1f302010-06-10 23:31:35 -07001227 skb_dst_set(skb, &rth->dst);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001228 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001229 }
1230
Changli Gaod8d1f302010-06-10 23:31:35 -07001231 if (!atomic_read(&rth->dst.__refcnt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001232 u32 score = rt_score(rth);
1233
1234 if (score <= min_score) {
1235 cand = rth;
1236 candp = rthp;
1237 min_score = score;
1238 }
1239 }
1240
1241 chain_length++;
1242
Changli Gaod8d1f302010-06-10 23:31:35 -07001243 rthp = &rth->dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001244 }
1245
1246 if (cand) {
1247 /* ip_rt_gc_elasticity used to be average length of chain
1248 * length, when exceeded gc becomes really aggressive.
1249 *
1250 * The second limit is less certain. At the moment it allows
1251 * only 2 entries per bucket. We will see.
1252 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind the route to arp only if it is an output
	   route or on the unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache,
			   as it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUs.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

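/*
 * Generation count for inet_peer derived state (learned PMTU and
 * redirect information).  It is bumped whenever a peer's data changes,
 * and cached rtables compare their rt_peer_genid snapshot against it
 * so they can revalidate lazily in ipv4_validate_peer().
 */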
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* Once a peer is attached to a destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

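/*
 * Unlink @rt from hash bucket @hash and drop the caller's reference.
 * Expired entries found while walking the chain are reaped as well.
 */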
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

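/*
 * Switch @dst over to the gateway learned from an ICMP redirect.  If
 * no usable neighbour entry can be obtained for the new gateway, the
 * old one is restored.
 */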
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

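/*
 * peer->pmtu_expires is cleared with cmpxchg() so that, of all racing
 * callers, exactly one observes the expiry and performs the rollback
 * to the original MTU.
 */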
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

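/*
 * An upper layer has signalled trouble with this route: drop obsolete
 * or redirected entries from the cache and roll back an expired
 * learned PMTU.
 */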
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and we start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (it is redundant)
 * and for "frag. needed" (it breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

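/*
 * Translate dst.error into an ICMP destination-unreachable code and
 * send it, rate limited by a per-peer token bucket: tokens accumulate
 * with elapsed jiffies up to ip_rt_error_burst, and each ICMP costs
 * ip_rt_error_cost.
 */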
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

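/* Return the largest plateau value below old_mtu (RFC 1191 style
 * plateau search), falling back to the minimum IPv4 MTU of 68.
 */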
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}

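/*
 * Apply the peer's learned PMTU to @dst while it is still valid; once
 * it has expired, restore the original MTU metric.  The cmpxchg()
 * guarantees the restore happens only once per expiry.
 */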
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}


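/*
 * Resynchronize @rt with its inet_peer when the peer generation count
 * has moved on: re-check the learned PMTU and redirect data, then
 * record the current generation.
 */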
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

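/* Default advertised MSS: device MTU minus 40 bytes of IP and TCP
 * headers, clamped to at least ip_rt_min_advmss and at most 65535 - 40.
 */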
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

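/*
 * All cached routes are allocated as DST_HOST; DST_NOPOLICY and
 * DST_NOXFRM are set from the in/out device configuration supplied
 * by the callers.
 */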
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}



static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif = in_dev->dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}

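/*
 * Build an input route cache entry for a forwarded packet and insert
 * it into the hash table, selecting among multipath next hops first
 * where that is configured.
 */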
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}

/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4 fl4;
	unsigned flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	unsigned hash;
	__be32 spec_dst;
	int err = -EINVAL;
	struct net *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst = daddr;
	rth->rt_key_src = saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags = flags|RTCF_LOCAL;
	rth->rt_type = res.type;
	rth->rt_key_tos = tos;
	rth->rt_dst = daddr;
	rth->rt_src = saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif = dev->ifindex;
	rth->rt_oif = 0;
	rth->rt_mark = skb->mark;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002490	/* Multicast recognition logic was moved from the route cache to here.
2491	   The problem was that too many Ethernet cards have broken or missing
2492	   hardware multicast filters :-( As a result, a host on a multicast
2493	   network acquires a lot of useless route cache entries, e.g. for
2494	   SDR messages from all over the world. Now we try to get rid of them.
2495	   Really, provided the software IP multicast filter is organized
2496	   reasonably (at least, hashed), this does not cause a slowdown
2497	   compared with route cache reject entries.
2498	   Note that multicast routers are not affected, because a
2499	   route cache entry is created for them eventually.
2500	 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002501 if (ipv4_is_multicast(daddr)) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002502 struct in_device *in_dev = __in_dev_get_rcu(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002503
Eric Dumazet96d36222010-06-02 19:21:31 +00002504 if (in_dev) {
David S. Millerdbdd9a52011-03-10 16:34:38 -08002505 int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2506 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002507 if (our
2508#ifdef CONFIG_IP_MROUTE
Joe Perches9d4fb272009-11-23 10:41:23 -08002509 ||
2510 (!ipv4_is_local_multicast(daddr) &&
2511 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002512#endif
Joe Perches9d4fb272009-11-23 10:41:23 -08002513 ) {
Eric Dumazet96d36222010-06-02 19:21:31 +00002514 int res = ip_route_input_mc(skb, daddr, saddr,
2515 tos, dev, our);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002516 rcu_read_unlock();
Eric Dumazet96d36222010-06-02 19:21:31 +00002517 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002518 }
2519 }
2520 rcu_read_unlock();
2521 return -EINVAL;
2522 }
Eric Dumazet96d36222010-06-02 19:21:31 +00002523 res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2524 rcu_read_unlock();
2525 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002526}
Eric Dumazet407eadd2010-05-10 11:32:55 +00002527EXPORT_SYMBOL(ip_route_input_common);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002528
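/*
 * __mkroute_output(): allocate and fill an output rtable for a FIB
 * lookup result.  Limited-broadcast, multicast and zeronet daddrs are
 * (re)classified here, local and multicast deliveries get their
 * dst.input/dst.output handlers set, and rt_set_nexthop() fills in
 * the nexthop data at the end.
 */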
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002529/* called with rcu_read_lock() */
David S. Miller982721f2011-02-16 21:44:24 -08002530static struct rtable *__mkroute_output(const struct fib_result *res,
David S. Miller68a5e3d2011-03-11 20:07:33 -05002531 const struct flowi4 *fl4,
David S. Miller813b3b52011-04-28 14:48:42 -07002532 __be32 orig_daddr, __be32 orig_saddr,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002533 int orig_oif, __u8 orig_rtos,
2534 struct net_device *dev_out,
David S. Miller5ada5522011-02-17 15:29:00 -08002535 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002536{
David S. Miller982721f2011-02-16 21:44:24 -08002537 struct fib_info *fi = res->fi;
David S. Miller5ada5522011-02-17 15:29:00 -08002538 struct in_device *in_dev;
David S. Miller982721f2011-02-16 21:44:24 -08002539 u16 type = res->type;
David S. Miller5ada5522011-02-17 15:29:00 -08002540 struct rtable *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002541
David S. Miller68a5e3d2011-03-11 20:07:33 -05002542 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
David S. Miller5ada5522011-02-17 15:29:00 -08002543 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002544
David S. Miller68a5e3d2011-03-11 20:07:33 -05002545 if (ipv4_is_lbcast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002546 type = RTN_BROADCAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002547 else if (ipv4_is_multicast(fl4->daddr))
David S. Miller982721f2011-02-16 21:44:24 -08002548 type = RTN_MULTICAST;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002549 else if (ipv4_is_zeronet(fl4->daddr))
David S. Miller5ada5522011-02-17 15:29:00 -08002550 return ERR_PTR(-EINVAL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551
2552 if (dev_out->flags & IFF_LOOPBACK)
2553 flags |= RTCF_LOCAL;
2554
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002555 in_dev = __in_dev_get_rcu(dev_out);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002556 if (!in_dev)
David S. Miller5ada5522011-02-17 15:29:00 -08002557 return ERR_PTR(-EINVAL);
Eric Dumazetebc0ffa2010-10-05 10:41:36 +00002558
David S. Miller982721f2011-02-16 21:44:24 -08002559 if (type == RTN_BROADCAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002560 flags |= RTCF_BROADCAST | RTCF_LOCAL;
David S. Miller982721f2011-02-16 21:44:24 -08002561 fi = NULL;
2562 } else if (type == RTN_MULTICAST) {
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002563 flags |= RTCF_MULTICAST | RTCF_LOCAL;
David S. Miller813b3b52011-04-28 14:48:42 -07002564 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2565 fl4->flowi4_proto))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002566 flags &= ~RTCF_LOCAL;
2567		/* If a multicast route does not exist, use
Eric Dumazetdd28d1a2010-09-29 11:53:50 +00002568		 * the default one, but do not gateway in this case.
2569		 * Yes, it is a hack.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002570 */
David S. Miller982721f2011-02-16 21:44:24 -08002571 if (fi && res->prefixlen < 4)
2572 fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002573 }
2574
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002575 rth = rt_dst_alloc(dev_out,
2576 IN_DEV_CONF_GET(in_dev, NOPOLICY),
David S. Miller0c4dcd52011-02-17 15:42:37 -08002577 IN_DEV_CONF_GET(in_dev, NOXFRM));
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002578 if (!rth)
David S. Miller5ada5522011-02-17 15:29:00 -08002579 return ERR_PTR(-ENOBUFS);
Dimitris Michailidis8391d072010-10-07 14:48:38 +00002580
David S. Millercf911662011-04-28 14:31:47 -07002581 rth->dst.output = ip_output;
2582
David S. Miller813b3b52011-04-28 14:48:42 -07002583 rth->rt_key_dst = orig_daddr;
2584 rth->rt_key_src = orig_saddr;
David S. Millercf911662011-04-28 14:31:47 -07002585 rth->rt_genid = rt_genid(dev_net(dev_out));
2586 rth->rt_flags = flags;
2587 rth->rt_type = type;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002588 rth->rt_key_tos = orig_rtos;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002589 rth->rt_dst = fl4->daddr;
2590 rth->rt_src = fl4->saddr;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002591 rth->rt_route_iif = 0;
David S. Miller813b3b52011-04-28 14:48:42 -07002592 rth->rt_iif = orig_oif ? : dev_out->ifindex;
2593 rth->rt_oif = orig_oif;
2594 rth->rt_mark = fl4->flowi4_mark;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002595 rth->rt_gateway = fl4->daddr;
2596	rth->rt_spec_dst = fl4->saddr;
David S. Millercf911662011-04-28 14:31:47 -07002597 rth->rt_peer_genid = 0;
2598 rth->peer = NULL;
2599 rth->fi = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002600
2601 RT_CACHE_STAT_INC(out_slow_tot);
2602
2603 if (flags & RTCF_LOCAL) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002604 rth->dst.input = ip_local_deliver;
David S. Miller68a5e3d2011-03-11 20:07:33 -05002605 rth->rt_spec_dst = fl4->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002606 }
2607 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
David S. Miller68a5e3d2011-03-11 20:07:33 -05002608 rth->rt_spec_dst = fl4->saddr;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002609 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002610 !(dev_out->flags & IFF_LOOPBACK)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002611 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002612 RT_CACHE_STAT_INC(out_slow_mc);
2613 }
2614#ifdef CONFIG_IP_MROUTE
David S. Miller982721f2011-02-16 21:44:24 -08002615 if (type == RTN_MULTICAST) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002616 if (IN_DEV_MFORWARD(in_dev) &&
David S. Miller813b3b52011-04-28 14:48:42 -07002617 !ipv4_is_local_multicast(fl4->daddr)) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002618 rth->dst.input = ip_mr_input;
2619 rth->dst.output = ip_mc_output;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002620 }
2621 }
2622#endif
2623 }
2624
David S. Miller813b3b52011-04-28 14:48:42 -07002625 rt_set_nexthop(rth, fl4, res, fi, type, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002626
David S. Miller5ada5522011-02-17 15:29:00 -08002627 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002628}
2629
Linus Torvalds1da177e2005-04-16 15:20:36 -07002630/*
2631 * Major route resolver routine.
Eric Dumazet0197aa32010-09-30 03:33:58 +00002632 * called with rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633 */
2634
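/*
 * Resolution order, as implemented below: validate any caller-supplied
 * saddr, honour an explicit oif, route to loopback when no daddr was
 * given, then consult the FIB; multipath and default-route selection
 * happen before __mkroute_output() builds the cache entry.
 */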
David S. Miller813b3b52011-04-28 14:48:42 -07002635static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002636{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002637 struct net_device *dev_out = NULL;
Julian Anastasovf61759e2011-12-02 11:39:42 +00002638 __u8 tos = RT_FL_TOS(fl4);
David S. Miller813b3b52011-04-28 14:48:42 -07002639 unsigned int flags = 0;
2640 struct fib_result res;
David S. Miller5ada5522011-02-17 15:29:00 -08002641 struct rtable *rth;
David S. Miller813b3b52011-04-28 14:48:42 -07002642 __be32 orig_daddr;
2643 __be32 orig_saddr;
2644 int orig_oif;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002645
2646 res.fi = NULL;
2647#ifdef CONFIG_IP_MULTIPLE_TABLES
2648 res.r = NULL;
2649#endif
2650
David S. Miller813b3b52011-04-28 14:48:42 -07002651 orig_daddr = fl4->daddr;
2652 orig_saddr = fl4->saddr;
2653 orig_oif = fl4->flowi4_oif;
2654
2655 fl4->flowi4_iif = net->loopback_dev->ifindex;
2656 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2657 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2658 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
David S. Miller44713b62011-03-04 21:24:47 -08002659
David S. Miller010c2702011-02-17 15:37:09 -08002660 rcu_read_lock();
David S. Miller813b3b52011-04-28 14:48:42 -07002661 if (fl4->saddr) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002662 rth = ERR_PTR(-EINVAL);
David S. Miller813b3b52011-04-28 14:48:42 -07002663 if (ipv4_is_multicast(fl4->saddr) ||
2664 ipv4_is_lbcast(fl4->saddr) ||
2665 ipv4_is_zeronet(fl4->saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002666 goto out;
2667
Linus Torvalds1da177e2005-04-16 15:20:36 -07002668		/* I removed the check for oif == dev_out->oif here.
2669		   It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002670		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2671		      is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002672		   2. Moreover, we are allowed to send packets with a saddr
2673		      of another iface. --ANK
2674 */
2675
David S. Miller813b3b52011-04-28 14:48:42 -07002676 if (fl4->flowi4_oif == 0 &&
2677 (ipv4_is_multicast(fl4->daddr) ||
2678 ipv4_is_lbcast(fl4->daddr))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002679 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002680 dev_out = __ip_dev_find(net, fl4->saddr, false);
Julian Anastasova210d012008-10-01 07:28:28 -07002681 if (dev_out == NULL)
2682 goto out;
2683
Linus Torvalds1da177e2005-04-16 15:20:36 -07002684			/* Special hack: the user can direct multicasts
2685			   and limited broadcast via the necessary interface
2686			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2687			   This hack is not just for fun, it allows
2688			   vic, vat and friends to work.
2689			   They bind a socket to loopback, set ttl to zero
2690			   and expect that it will work.
2691			   From the viewpoint of the routing cache they are broken,
2692			   because we are not allowed to build a multicast path
2693			   with a loopback source addr (look, the routing cache
2694			   cannot know that ttl is zero, so the packet
2695			   will not leave this host and the route is valid).
2696			   Luckily, this hack is a good workaround.
2697 */
2698
David S. Miller813b3b52011-04-28 14:48:42 -07002699 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002700 goto make_route;
2701 }
Julian Anastasova210d012008-10-01 07:28:28 -07002702
David S. Miller813b3b52011-04-28 14:48:42 -07002703 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
Julian Anastasova210d012008-10-01 07:28:28 -07002704 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
David S. Miller813b3b52011-04-28 14:48:42 -07002705 if (!__ip_dev_find(net, fl4->saddr, false))
Julian Anastasova210d012008-10-01 07:28:28 -07002706 goto out;
Julian Anastasova210d012008-10-01 07:28:28 -07002707 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002708 }
2709
2710
David S. Miller813b3b52011-04-28 14:48:42 -07002711 if (fl4->flowi4_oif) {
2712 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002713 rth = ERR_PTR(-ENODEV);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002714 if (dev_out == NULL)
2715 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002716
2717 /* RACE: Check return value of inet_select_addr instead. */
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002718 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
David S. Millerb23dd4f2011-03-02 14:31:35 -08002719 rth = ERR_PTR(-ENETUNREACH);
Eric Dumazetfc75fc82010-12-22 04:39:39 +00002720 goto out;
2721 }
David S. Miller813b3b52011-04-28 14:48:42 -07002722 if (ipv4_is_local_multicast(fl4->daddr) ||
2723 ipv4_is_lbcast(fl4->daddr)) {
2724 if (!fl4->saddr)
2725 fl4->saddr = inet_select_addr(dev_out, 0,
2726 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002727 goto make_route;
2728 }
David S. Miller813b3b52011-04-28 14:48:42 -07002729 if (fl4->saddr) {
2730 if (ipv4_is_multicast(fl4->daddr))
2731 fl4->saddr = inet_select_addr(dev_out, 0,
2732 fl4->flowi4_scope);
2733 else if (!fl4->daddr)
2734 fl4->saddr = inet_select_addr(dev_out, 0,
2735 RT_SCOPE_HOST);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002736 }
2737 }
2738
David S. Miller813b3b52011-04-28 14:48:42 -07002739 if (!fl4->daddr) {
2740 fl4->daddr = fl4->saddr;
2741 if (!fl4->daddr)
2742 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002743 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002744 fl4->flowi4_oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002745 res.type = RTN_LOCAL;
2746 flags |= RTCF_LOCAL;
2747 goto make_route;
2748 }
2749
David S. Miller813b3b52011-04-28 14:48:42 -07002750 if (fib_lookup(net, fl4, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002751 res.fi = NULL;
David S. Miller813b3b52011-04-28 14:48:42 -07002752 if (fl4->flowi4_oif) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002753			/* Apparently, the routing tables are wrong. Assume
2754			   that the destination is on-link.
2755
2756			   WHY? DW.
2757			   Because we are allowed to send to an iface
2758			   even if it has NO routes and NO assigned
2759			   addresses. When oif is specified, the routing
2760			   tables are looked up with only one purpose:
2761			   to catch whether the destination is gatewayed rather
2762			   than direct. Moreover, if MSG_DONTROUTE is set,
2763			   we send the packet, ignoring both routing tables
2764			   and ifaddr state. --ANK
2765
2766
2767			   We could do this even if oif is unknown,
2768			   as IPv6 likely does, but we do not.
2769 */
2770
David S. Miller813b3b52011-04-28 14:48:42 -07002771 if (fl4->saddr == 0)
2772 fl4->saddr = inet_select_addr(dev_out, 0,
2773 RT_SCOPE_LINK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002774 res.type = RTN_UNICAST;
2775 goto make_route;
2776 }
David S. Millerb23dd4f2011-03-02 14:31:35 -08002777 rth = ERR_PTR(-ENETUNREACH);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778 goto out;
2779 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002780
2781 if (res.type == RTN_LOCAL) {
David S. Miller813b3b52011-04-28 14:48:42 -07002782 if (!fl4->saddr) {
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002783 if (res.fi->fib_prefsrc)
David S. Miller813b3b52011-04-28 14:48:42 -07002784 fl4->saddr = res.fi->fib_prefsrc;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002785 else
David S. Miller813b3b52011-04-28 14:48:42 -07002786 fl4->saddr = fl4->daddr;
Joel Sing9fc3bbb2011-01-03 20:24:20 +00002787 }
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002788 dev_out = net->loopback_dev;
David S. Miller813b3b52011-04-28 14:48:42 -07002789 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002790 res.fi = NULL;
2791 flags |= RTCF_LOCAL;
2792 goto make_route;
2793 }
2794
2795#ifdef CONFIG_IP_ROUTE_MULTIPATH
David S. Miller813b3b52011-04-28 14:48:42 -07002796 if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
David S. Miller1b7fe5932011-03-10 17:01:16 -08002797 fib_select_multipath(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798 else
2799#endif
David S. Miller21d8c492011-04-14 14:49:37 -07002800 if (!res.prefixlen &&
2801 res.table->tb_num_default > 1 &&
David S. Miller813b3b52011-04-28 14:48:42 -07002802 res.type == RTN_UNICAST && !fl4->flowi4_oif)
David S. Miller0c838ff2011-01-31 16:16:50 -08002803 fib_select_default(&res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002804
David S. Miller813b3b52011-04-28 14:48:42 -07002805 if (!fl4->saddr)
2806 fl4->saddr = FIB_RES_PREFSRC(net, res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002807
Linus Torvalds1da177e2005-04-16 15:20:36 -07002808 dev_out = FIB_RES_DEV(res);
David S. Miller813b3b52011-04-28 14:48:42 -07002809 fl4->flowi4_oif = dev_out->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810
2811
2812make_route:
David S. Miller813b3b52011-04-28 14:48:42 -07002813 rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
Julian Anastasovf61759e2011-12-02 11:39:42 +00002814 tos, dev_out, flags);
David S. Millerb23dd4f2011-03-02 14:31:35 -08002815 if (!IS_ERR(rth)) {
David S. Miller5ada5522011-02-17 15:29:00 -08002816 unsigned int hash;
2817
David S. Miller813b3b52011-04-28 14:48:42 -07002818 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
David S. Miller5ada5522011-02-17 15:29:00 -08002819 rt_genid(dev_net(dev_out)));
David S. Miller813b3b52011-04-28 14:48:42 -07002820 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
David S. Miller5ada5522011-02-17 15:29:00 -08002821 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822
David S. Miller010c2702011-02-17 15:37:09 -08002823out:
2824 rcu_read_unlock();
David S. Millerb23dd4f2011-03-02 14:31:35 -08002825 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002826}
2827
David S. Miller813b3b52011-04-28 14:48:42 -07002828struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002829{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002830 struct rtable *rth;
David S. Miller010c2702011-02-17 15:37:09 -08002831 unsigned int hash;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002832
Neil Horman1080d702008-10-27 12:28:25 -07002833 if (!rt_caching(net))
2834 goto slow_output;
2835
David S. Miller9d6ec932011-03-12 01:12:47 -05002836 hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002837
2838 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08002839 for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
Changli Gaod8d1f302010-06-10 23:31:35 -07002840 rth = rcu_dereference_bh(rth->dst.rt_next)) {
David S. Miller9d6ec932011-03-12 01:12:47 -05002841 if (rth->rt_key_dst == flp4->daddr &&
2842 rth->rt_key_src == flp4->saddr &&
David S. Millerc7537962010-11-11 17:07:48 -08002843 rt_is_output_route(rth) &&
David S. Miller9d6ec932011-03-12 01:12:47 -05002844 rth->rt_oif == flp4->flowi4_oif &&
2845 rth->rt_mark == flp4->flowi4_mark &&
David S. Miller475949d2011-05-03 19:45:15 -07002846 !((rth->rt_key_tos ^ flp4->flowi4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002847 (IPTOS_RT_MASK | RTO_ONLINK)) &&
Changli Gaod8d1f302010-06-10 23:31:35 -07002848 net_eq(dev_net(rth->dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002849 !rt_is_expired(rth)) {
David S. Millerde398fb2011-12-05 13:21:42 -05002850 ipv4_validate_peer(rth);
Changli Gaod8d1f302010-06-10 23:31:35 -07002851 dst_use(&rth->dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002852 RT_CACHE_STAT_INC(out_hit);
2853 rcu_read_unlock_bh();
David S. Miller56157872011-05-02 14:37:45 -07002854 if (!flp4->saddr)
2855 flp4->saddr = rth->rt_src;
2856 if (!flp4->daddr)
2857 flp4->daddr = rth->rt_dst;
David S. Millerb23dd4f2011-03-02 14:31:35 -08002858 return rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002859 }
2860 RT_CACHE_STAT_INC(out_hlist_search);
2861 }
2862 rcu_read_unlock_bh();
2863
Neil Horman1080d702008-10-27 12:28:25 -07002864slow_output:
David S. Miller9d6ec932011-03-12 01:12:47 -05002865 return ip_route_output_slow(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002866}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002867EXPORT_SYMBOL_GPL(__ip_route_output_key);
2868
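/*
 * Usage sketch (hypothetical caller, for illustration only -- real
 * callers normally go through ip_route_output_flow() or the
 * ip_route_output_key() wrapper):
 *
 *	struct flowi4 fl4 = {
 *		.daddr      = dst_ip,		// placeholder address
 *		.flowi4_tos = RT_TOS(tos),	// placeholder tos
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// ... use rt->dst, then drop the reference:
 *	ip_rt_put(rt);
 */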
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002869static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2870{
2871 return NULL;
2872}
2873
Steffen Klassertebb762f2011-11-23 02:12:51 +00002874static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
Roland Dreierec831ea2011-01-31 13:16:00 -08002875{
Steffen Klassert618f9bc2011-11-23 02:13:31 +00002876 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2877
2878 return mtu ? : dst->dev->mtu;
Roland Dreierec831ea2011-01-31 13:16:00 -08002879}
2880
David S. Miller14e50e52007-05-24 18:17:54 -07002881static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2882{
2883}
2884
Held Bernhard0972ddb2011-04-24 22:07:32 +00002885static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2886 unsigned long old)
2887{
2888 return NULL;
2889}
2890
David S. Miller14e50e52007-05-24 18:17:54 -07002891static struct dst_ops ipv4_dst_blackhole_ops = {
2892 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -08002893 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002894 .destroy = ipv4_dst_destroy,
Jianzhao Wangae2688d2010-09-08 14:35:43 -07002895 .check = ipv4_blackhole_dst_check,
Steffen Klassertebb762f2011-11-23 02:12:51 +00002896 .mtu = ipv4_blackhole_mtu,
Eric Dumazet214f45c2011-02-18 11:39:01 -08002897 .default_advmss = ipv4_default_advmss,
David S. Miller14e50e52007-05-24 18:17:54 -07002898 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
Held Bernhard0972ddb2011-04-24 22:07:32 +00002899 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
David S. Millerd3aaeb32011-07-18 00:40:17 -07002900 .neigh_lookup = ipv4_neigh_lookup,
David S. Miller14e50e52007-05-24 18:17:54 -07002901};
2902
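/*
 * ipv4_blackhole_route(): clone an output route into a "blackhole"
 * dst whose input/output handlers simply discard packets, while the
 * flow keys, metrics, peer and fib_info are copied from the original.
 * The xfrm lookup path is the notable user at this point.
 */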
David S. Miller2774c132011-03-01 14:59:04 -08002903struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
David S. Miller14e50e52007-05-24 18:17:54 -07002904{
David S. Miller5c1e6aa2011-04-28 14:13:38 -07002905 struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
David S. Miller2774c132011-03-01 14:59:04 -08002906 struct rtable *ort = (struct rtable *) dst_orig;
David S. Miller14e50e52007-05-24 18:17:54 -07002907
2908 if (rt) {
Changli Gaod8d1f302010-06-10 23:31:35 -07002909 struct dst_entry *new = &rt->dst;
David S. Miller14e50e52007-05-24 18:17:54 -07002910
David S. Miller14e50e52007-05-24 18:17:54 -07002911 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002912 new->input = dst_discard;
2913 new->output = dst_discard;
David S. Millerdefb3512010-12-08 21:16:57 -08002914 dst_copy_metrics(new, &ort->dst);
David S. Miller14e50e52007-05-24 18:17:54 -07002915
Changli Gaod8d1f302010-06-10 23:31:35 -07002916 new->dev = ort->dst.dev;
David S. Miller14e50e52007-05-24 18:17:54 -07002917 if (new->dev)
2918 dev_hold(new->dev);
2919
David S. Miller5e2b61f2011-03-04 21:47:09 -08002920 rt->rt_key_dst = ort->rt_key_dst;
2921 rt->rt_key_src = ort->rt_key_src;
David S. Miller475949d2011-05-03 19:45:15 -07002922 rt->rt_key_tos = ort->rt_key_tos;
OGAWA Hirofumi1b86a582011-04-07 14:04:08 -07002923 rt->rt_route_iif = ort->rt_route_iif;
David S. Miller5e2b61f2011-03-04 21:47:09 -08002924 rt->rt_iif = ort->rt_iif;
2925 rt->rt_oif = ort->rt_oif;
2926 rt->rt_mark = ort->rt_mark;
David S. Miller14e50e52007-05-24 18:17:54 -07002927
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002928 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002929 rt->rt_flags = ort->rt_flags;
2930 rt->rt_type = ort->rt_type;
2931 rt->rt_dst = ort->rt_dst;
2932 rt->rt_src = ort->rt_src;
David S. Miller14e50e52007-05-24 18:17:54 -07002933 rt->rt_gateway = ort->rt_gateway;
2934 rt->rt_spec_dst = ort->rt_spec_dst;
2935 rt->peer = ort->peer;
2936 if (rt->peer)
2937 atomic_inc(&rt->peer->refcnt);
David S. Miller62fa8a82011-01-26 20:51:05 -08002938 rt->fi = ort->fi;
2939 if (rt->fi)
2940 atomic_inc(&rt->fi->fib_clntref);
David S. Miller14e50e52007-05-24 18:17:54 -07002941
2942 dst_free(new);
2943 }
2944
David S. Miller2774c132011-03-01 14:59:04 -08002945 dst_release(dst_orig);
2946
2947 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
David S. Miller14e50e52007-05-24 18:17:54 -07002948}
2949
David S. Miller9d6ec932011-03-12 01:12:47 -05002950struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
David S. Millerb23dd4f2011-03-02 14:31:35 -08002951 struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002952{
David S. Miller9d6ec932011-03-12 01:12:47 -05002953 struct rtable *rt = __ip_route_output_key(net, flp4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002954
David S. Millerb23dd4f2011-03-02 14:31:35 -08002955 if (IS_ERR(rt))
2956 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002957
David S. Miller56157872011-05-02 14:37:45 -07002958 if (flp4->flowi4_proto)
David S. Miller9d6ec932011-03-12 01:12:47 -05002959 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2960 flowi4_to_flowi(flp4),
2961 sk, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002962
David S. Millerb23dd4f2011-03-02 14:31:35 -08002963 return rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002964}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002965EXPORT_SYMBOL_GPL(ip_route_output_flow);
2966
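/*
 * rt_fill_info(): encode @rt as an RTM_NEWROUTE netlink message.
 * Peer id, timestamps and PMTU expiry come from the inet_peer; if the
 * skb runs out of room the partial message is cancelled and -EMSGSIZE
 * returned.
 */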
Benjamin Thery4feb88e2009-01-22 04:56:23 +00002967static int rt_fill_info(struct net *net,
2968 struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002969 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002970{
Eric Dumazet511c3f92009-06-02 05:14:27 +00002971 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002972 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002973 struct nlmsghdr *nlh;
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00002974 unsigned long expires = 0;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00002975 const struct inet_peer *peer = rt->peer;
Thomas Grafe3703b32006-11-27 09:27:07 -08002976 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002977
2978 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2979 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002980 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002981
2982 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002983 r->rtm_family = AF_INET;
2984 r->rtm_dst_len = 32;
2985 r->rtm_src_len = 0;
David S. Miller475949d2011-05-03 19:45:15 -07002986 r->rtm_tos = rt->rt_key_tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002987 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002988 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002989 r->rtm_type = rt->rt_type;
2990 r->rtm_scope = RT_SCOPE_UNIVERSE;
2991 r->rtm_protocol = RTPROT_UNSPEC;
2992 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2993 if (rt->rt_flags & RTCF_NOTIFY)
2994 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002995
Al Viro17fb2c62006-09-26 22:15:25 -07002996 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002997
David S. Miller5e2b61f2011-03-04 21:47:09 -08002998 if (rt->rt_key_src) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002999 r->rtm_src_len = 32;
David S. Miller5e2b61f2011-03-04 21:47:09 -08003000 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003001 }
Changli Gaod8d1f302010-06-10 23:31:35 -07003002 if (rt->dst.dev)
3003 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
Patrick McHardyc7066f72011-01-14 13:36:42 +01003004#ifdef CONFIG_IP_ROUTE_CLASSID
Changli Gaod8d1f302010-06-10 23:31:35 -07003005 if (rt->dst.tclassid)
3006 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003007#endif
David S. Millerc7537962010-11-11 17:07:48 -08003008 if (rt_is_input_route(rt))
Al Viro17fb2c62006-09-26 22:15:25 -07003009 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
David S. Miller5e2b61f2011-03-04 21:47:09 -08003010 else if (rt->rt_src != rt->rt_key_src)
Al Viro17fb2c62006-09-26 22:15:25 -07003011 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07003012
Linus Torvalds1da177e2005-04-16 15:20:36 -07003013 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07003014 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07003015
David S. Millerdefb3512010-12-08 21:16:57 -08003016 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003017 goto nla_put_failure;
3018
David S. Miller5e2b61f2011-03-04 21:47:09 -08003019 if (rt->rt_mark)
3020 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
Eric Dumazet963bfee2010-07-20 22:03:14 +00003021
Changli Gaod8d1f302010-06-10 23:31:35 -07003022 error = rt->dst.error;
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003023 if (peer) {
Eric Dumazet317fe0e2010-06-16 04:52:13 +00003024 inet_peer_refcheck(rt->peer);
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003025 id = atomic_read(&peer->ip_id_count) & 0xffff;
3026 if (peer->tcp_ts_stamp) {
3027 ts = peer->tcp_ts;
3028 tsage = get_seconds() - peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003029 }
Eric Dumazetfe6fe792011-06-08 06:07:07 +00003030 expires = ACCESS_ONCE(peer->pmtu_expires);
Steffen Klassert2bc8ca42011-10-11 01:12:02 +00003031 if (expires) {
3032 if (time_before(jiffies, expires))
3033 expires -= jiffies;
3034 else
3035 expires = 0;
3036 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003037 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07003038
David S. Millerc7537962010-11-11 17:07:48 -08003039 if (rt_is_input_route(rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003040#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07003041 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003042
Joe Perchesf97c1e02007-12-16 13:45:43 -08003043 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003044 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
David S. Miller9a1b9492011-05-04 12:18:54 -07003045 int err = ipmr_get_route(net, skb,
3046 rt->rt_src, rt->rt_dst,
3047 r, nowait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003048 if (err <= 0) {
3049 if (!nowait) {
3050 if (err == 0)
3051 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07003052 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003053 } else {
3054 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07003055 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08003056 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003057 }
3058 }
3059 } else
3060#endif
David S. Miller5e2b61f2011-03-04 21:47:09 -08003061 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003062 }
3063
Changli Gaod8d1f302010-06-10 23:31:35 -07003064 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
Thomas Grafe3703b32006-11-27 09:27:07 -08003065 expires, error) < 0)
3066 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003067
Thomas Grafbe403ea2006-08-17 18:15:17 -07003068 return nlmsg_end(skb, nlh);
3069
3070nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08003071 nlmsg_cancel(skb, nlh);
3072 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003073}
3074
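/*
 * RTM_GETROUTE handler: parse the request attributes, then either do
 * an input-route lookup through a dummy skb (when RTA_IIF is given)
 * or an output lookup via ip_route_output_key(), and report the
 * result back with rt_fill_info()/rtnl_unicast().
 */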
Thomas Graf63f34442007-03-22 11:55:17 -07003075static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003076{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09003077 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07003078 struct rtmsg *rtm;
3079 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003080 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07003081 __be32 dst = 0;
3082 __be32 src = 0;
3083 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07003084 int err;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003085 int mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003086 struct sk_buff *skb;
3087
Thomas Grafd889ce32006-08-17 18:15:44 -07003088 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3089 if (err < 0)
3090 goto errout;
3091
3092 rtm = nlmsg_data(nlh);
3093
Linus Torvalds1da177e2005-04-16 15:20:36 -07003094 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07003095 if (skb == NULL) {
3096 err = -ENOBUFS;
3097 goto errout;
3098 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003099
3100 /* Reserve room for dummy headers, this skb can pass
3101 through good chunk of routing engine.
3102 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07003103 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07003104 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07003105
3106 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07003107 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003108 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3109
Al Viro17fb2c62006-09-26 22:15:25 -07003110 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3111 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07003112 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003113 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003114
3115 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07003116 struct net_device *dev;
3117
Denis V. Lunev19375042008-02-28 20:52:04 -08003118 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07003119 if (dev == NULL) {
3120 err = -ENODEV;
3121 goto errout_free;
3122 }
3123
Linus Torvalds1da177e2005-04-16 15:20:36 -07003124 skb->protocol = htons(ETH_P_IP);
3125 skb->dev = dev;
Eric Dumazet963bfee2010-07-20 22:03:14 +00003126 skb->mark = mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003127 local_bh_disable();
3128 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3129 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07003130
Eric Dumazet511c3f92009-06-02 05:14:27 +00003131 rt = skb_rtable(skb);
Changli Gaod8d1f302010-06-10 23:31:35 -07003132 if (err == 0 && rt->dst.error)
3133 err = -rt->dst.error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003134 } else {
David S. Miller68a5e3d2011-03-11 20:07:33 -05003135 struct flowi4 fl4 = {
3136 .daddr = dst,
3137 .saddr = src,
3138 .flowi4_tos = rtm->rtm_tos,
3139 .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3140 .flowi4_mark = mark,
Thomas Grafd889ce32006-08-17 18:15:44 -07003141 };
David S. Miller9d6ec932011-03-12 01:12:47 -05003142 rt = ip_route_output_key(net, &fl4);
David S. Millerb23dd4f2011-03-02 14:31:35 -08003143
3144 err = 0;
3145 if (IS_ERR(rt))
3146 err = PTR_ERR(rt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003147 }
Thomas Grafd889ce32006-08-17 18:15:44 -07003148
Linus Torvalds1da177e2005-04-16 15:20:36 -07003149 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07003150 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003151
Changli Gaod8d1f302010-06-10 23:31:35 -07003152 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003153 if (rtm->rtm_flags & RTM_F_NOTIFY)
3154 rt->rt_flags |= RTCF_NOTIFY;
3155
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003156 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08003157 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07003158 if (err <= 0)
3159 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003160
Denis V. Lunev19375042008-02-28 20:52:04 -08003161 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07003162errout:
Thomas Graf2942e902006-08-15 00:30:25 -07003163 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003164
Thomas Grafd889ce32006-08-17 18:15:44 -07003165errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003166 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07003167 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003168}
3169
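/*
 * Netlink dump of the route cache: walk every hash bucket under
 * rcu_read_lock_bh(), resuming from the bucket/index saved in
 * cb->args[] across invocations.
 */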
3170int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
3171{
3172 struct rtable *rt;
3173 int h, s_h;
3174 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08003175 struct net *net;
3176
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09003177 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003178
3179 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08003180 if (s_h < 0)
3181 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003182 s_idx = idx = cb->args[1];
Eric Dumazeta6272662008-08-28 01:11:25 -07003183 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3184 if (!rt_hash_table[h].chain)
3185 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003186 rcu_read_lock_bh();
Paul E. McKenneya898def2010-02-22 17:04:49 -08003187 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
Changli Gaod8d1f302010-06-10 23:31:35 -07003188 rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3189 if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003190 continue;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07003191 if (rt_is_expired(rt))
Eric Dumazet29e75252008-01-31 17:05:09 -08003192 continue;
Changli Gaod8d1f302010-06-10 23:31:35 -07003193 skb_dst_set_noref(skb, &rt->dst);
Benjamin Thery4feb88e2009-01-22 04:56:23 +00003194 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003195 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07003196 1, NLM_F_MULTI) <= 0) {
Eric Dumazetadf30902009-06-02 05:19:30 +00003197 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003198 rcu_read_unlock_bh();
3199 goto done;
3200 }
Eric Dumazetadf30902009-06-02 05:19:30 +00003201 skb_dst_drop(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003202 }
3203 rcu_read_unlock_bh();
3204 }
3205
3206done:
3207 cb->args[0] = h;
3208 cb->args[1] = idx;
3209 return skb->len;
3210}
3211
3212void ip_rt_multicast_event(struct in_device *in_dev)
3213{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07003214 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003215}
3216
3217#ifdef CONFIG_SYSCTL
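/*
 * Writing an integer to /proc/sys/net/ipv4/route/flush flushes the
 * route cache of the owning namespace; the value written is passed to
 * rt_cache_flush() as the delay.
 */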
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003218static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003219 void __user *buffer,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003220 size_t *lenp, loff_t *ppos)
3221{
3222 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07003223 int flush_delay;
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003224 ctl_table ctl;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003225 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07003226
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003227 memcpy(&ctl, __ctl, sizeof(ctl));
3228 ctl.data = &flush_delay;
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07003229 proc_dointvec(&ctl, write, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07003230
Denis V. Lunev81c684d2008-07-08 03:05:28 -07003231 net = (struct net *)__ctl->extra1;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003232 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003233 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003234 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003235
3236 return -EINVAL;
3237}
3238
Al Viroeeb61f72008-07-27 08:59:33 +01003239static ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003240 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003241 .procname = "gc_thresh",
3242 .data = &ipv4_dst_ops.gc_thresh,
3243 .maxlen = sizeof(int),
3244 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003245 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003246 },
3247 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003248 .procname = "max_size",
3249 .data = &ip_rt_max_size,
3250 .maxlen = sizeof(int),
3251 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003252 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003253 },
3254 {
3255 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09003256
Linus Torvalds1da177e2005-04-16 15:20:36 -07003257 .procname = "gc_min_interval",
3258 .data = &ip_rt_gc_min_interval,
3259 .maxlen = sizeof(int),
3260 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003261 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003262 },
3263 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003264 .procname = "gc_min_interval_ms",
3265 .data = &ip_rt_gc_min_interval,
3266 .maxlen = sizeof(int),
3267 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003268 .proc_handler = proc_dointvec_ms_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003269 },
3270 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003271 .procname = "gc_timeout",
3272 .data = &ip_rt_gc_timeout,
3273 .maxlen = sizeof(int),
3274 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003275 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003276 },
3277 {
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003278 .procname = "gc_interval",
3279 .data = &ip_rt_gc_interval,
3280 .maxlen = sizeof(int),
3281 .mode = 0644,
3282 .proc_handler = proc_dointvec_jiffies,
3283 },
3284 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003285 .procname = "redirect_load",
3286 .data = &ip_rt_redirect_load,
3287 .maxlen = sizeof(int),
3288 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003289 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003290 },
3291 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003292 .procname = "redirect_number",
3293 .data = &ip_rt_redirect_number,
3294 .maxlen = sizeof(int),
3295 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003296 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003297 },
3298 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003299 .procname = "redirect_silence",
3300 .data = &ip_rt_redirect_silence,
3301 .maxlen = sizeof(int),
3302 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003303 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003304 },
3305 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003306 .procname = "error_cost",
3307 .data = &ip_rt_error_cost,
3308 .maxlen = sizeof(int),
3309 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003310 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003311 },
3312 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003313 .procname = "error_burst",
3314 .data = &ip_rt_error_burst,
3315 .maxlen = sizeof(int),
3316 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003317 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003318 },
3319 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003320 .procname = "gc_elasticity",
3321 .data = &ip_rt_gc_elasticity,
3322 .maxlen = sizeof(int),
3323 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003324 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003325 },
3326 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003327 .procname = "mtu_expires",
3328 .data = &ip_rt_mtu_expires,
3329 .maxlen = sizeof(int),
3330 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003331 .proc_handler = proc_dointvec_jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003332 },
3333 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003334 .procname = "min_pmtu",
3335 .data = &ip_rt_min_pmtu,
3336 .maxlen = sizeof(int),
3337 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003338 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003339 },
3340 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003341 .procname = "min_adv_mss",
3342 .data = &ip_rt_min_advmss,
3343 .maxlen = sizeof(int),
3344 .mode = 0644,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003345 .proc_handler = proc_dointvec,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003346 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003347 { }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003348};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003349
Al Viro2f4520d2008-08-25 15:17:44 -07003350static struct ctl_table empty[1];
3351
3352static struct ctl_table ipv4_skeleton[] =
3353{
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003354 { .procname = "route",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003355 .mode = 0555, .child = ipv4_route_table},
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003356 { .procname = "neigh",
Hugh Dickinsd994af02008-08-27 02:35:18 -07003357 .mode = 0555, .child = empty},
Al Viro2f4520d2008-08-25 15:17:44 -07003358 { }
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003359};
3360
Al Viro2f4520d2008-08-25 15:17:44 -07003361static __net_initdata struct ctl_path ipv4_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003362 { .procname = "net", },
3363 { .procname = "ipv4", },
Al Viro2f4520d2008-08-25 15:17:44 -07003364 { },
3365};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003366
3367static struct ctl_table ipv4_route_flush_table[] = {
3368 {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003369 .procname = "flush",
3370 .maxlen = sizeof(int),
3371 .mode = 0200,
Alexey Dobriyan6d9f2392008-11-03 18:21:05 -08003372 .proc_handler = ipv4_sysctl_rtcache_flush,
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003373 },
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003374 { },
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003375};
3376
Al Viro2f4520d2008-08-25 15:17:44 -07003377static __net_initdata struct ctl_path ipv4_route_path[] = {
Eric W. Biedermanf8572d82009-11-05 13:32:03 -08003378 { .procname = "net", },
3379 { .procname = "ipv4", },
3380 { .procname = "route", },
Al Viro2f4520d2008-08-25 15:17:44 -07003381 { },
3382};
3383
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003384static __net_init int sysctl_route_net_init(struct net *net)
3385{
3386 struct ctl_table *tbl;
3387
3388 tbl = ipv4_route_flush_table;
Octavian Purdila09ad9bc2009-11-25 15:14:13 -08003389 if (!net_eq(net, &init_net)) {
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003390 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3391 if (tbl == NULL)
3392 goto err_dup;
3393 }
3394 tbl[0].extra1 = net;
3395
3396 net->ipv4.route_hdr =
3397 register_net_sysctl_table(net, ipv4_route_path, tbl);
3398 if (net->ipv4.route_hdr == NULL)
3399 goto err_reg;
3400 return 0;
3401
3402err_reg:
3403 if (tbl != ipv4_route_flush_table)
3404 kfree(tbl);
3405err_dup:
3406 return -ENOMEM;
3407}
3408
3409static __net_exit void sysctl_route_net_exit(struct net *net)
3410{
3411 struct ctl_table *tbl;
3412
3413 tbl = net->ipv4.route_hdr->ctl_table_arg;
3414 unregister_net_sysctl_table(net->ipv4.route_hdr);
3415 BUG_ON(tbl == ipv4_route_flush_table);
3416 kfree(tbl);
3417}
3418
3419static __net_initdata struct pernet_operations sysctl_route_ops = {
3420 .init = sysctl_route_net_init,
3421 .exit = sysctl_route_net_exit,
3422};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003423#endif
3424
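/*
 * Per-namespace cache generation: rt_genid is seeded randomly at
 * namespace init, and bumping it later implicitly invalidates every
 * cached entry tagged with an older genid (see rt_is_expired()).
 */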
Neil Horman3ee94372010-05-08 01:57:52 -07003425static __net_init int rt_genid_init(struct net *net)
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003426{
Neil Horman3ee94372010-05-08 01:57:52 -07003427 get_random_bytes(&net->ipv4.rt_genid,
3428 sizeof(net->ipv4.rt_genid));
David S. Miller436c3b62011-03-24 17:42:21 -07003429 get_random_bytes(&net->ipv4.dev_addr_genid,
3430 sizeof(net->ipv4.dev_addr_genid));
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003431 return 0;
3432}
3433
Neil Horman3ee94372010-05-08 01:57:52 -07003434static __net_initdata struct pernet_operations rt_genid_ops = {
3435 .init = rt_genid_init,
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003436};
3437
3438
Patrick McHardyc7066f72011-01-14 13:36:42 +01003439#ifdef CONFIG_IP_ROUTE_CLASSID
Tejun Heo7d720c32010-02-16 15:20:26 +00003440struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
Patrick McHardyc7066f72011-01-14 13:36:42 +01003441#endif /* CONFIG_IP_ROUTE_CLASSID */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003442
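/*
 * "rhash_entries=N" on the kernel command line overrides the size of
 * the route cache hash table that ip_rt_init() allocates below.
 */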
3443static __initdata unsigned long rhash_entries;
3444static int __init set_rhash_entries(char *str)
3445{
3446 if (!str)
3447 return 0;
3448 rhash_entries = simple_strtoul(str, &str, 0);
3449 return 1;
3450}
3451__setup("rhash_entries=", set_rhash_entries);
3452
3453int __init ip_rt_init(void)
3454{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003455 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003456
Patrick McHardyc7066f72011-01-14 13:36:42 +01003457#ifdef CONFIG_IP_ROUTE_CLASSID
Ingo Molnar0dcec8c2009-02-25 14:07:33 +01003458 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003459 if (!ip_rt_acct)
3460 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003461#endif
3462
Alexey Dobriyane5d679f2006-08-26 19:25:52 -07003463 ipv4_dst_ops.kmem_cachep =
3464 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003465 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003466
David S. Miller14e50e52007-05-24 18:17:54 -07003467 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3468
Eric Dumazetfc66f952010-10-08 06:37:34 +00003469 if (dst_entries_init(&ipv4_dst_ops) < 0)
3470 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3471
3472 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3473 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3474
Eric Dumazet424c4b72005-07-05 14:58:19 -07003475 rt_hash_table = (struct rt_hash_bucket *)
3476 alloc_large_system_hash("IP route cache",
3477 sizeof(struct rt_hash_bucket),
3478 rhash_entries,
Jan Beulich44813742009-09-21 17:03:05 -07003479 (totalram_pages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003480 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003481 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003482 &rt_hash_log,
3483 &rt_hash_mask,
Anton Blanchardc9503e02009-04-27 05:42:24 -07003484 rhash_entries ? 0 : 512 * 1024);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003485 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3486 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003487
3488 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3489 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3490
Linus Torvalds1da177e2005-04-16 15:20:36 -07003491 devinet_init();
3492 ip_fib_init();
3493
Eric Dumazet9f28a2f2011-12-21 15:47:16 -05003494 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3495 expires_ljiffies = jiffies;
3496 schedule_delayed_work(&expires_work,
3497 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3498
Denis V. Lunev73b38712008-02-28 20:51:18 -08003499 if (ip_rt_proc_init())
Pavel Emelyanov107f1632007-12-05 21:14:28 -08003500 printk(KERN_ERR "Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003501#ifdef CONFIG_XFRM
3502 xfrm_init();
Neil Hormana33bc5c2009-07-30 18:52:15 -07003503 xfrm4_init(ip_rt_max_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003504#endif
Greg Rosec7ac8672011-06-10 01:27:09 +00003505 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
Thomas Graf63f34442007-03-22 11:55:17 -07003506
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003507#ifdef CONFIG_SYSCTL
3508 register_pernet_subsys(&sysctl_route_ops);
3509#endif
Neil Horman3ee94372010-05-08 01:57:52 -07003510 register_pernet_subsys(&rt_genid_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003511 return rc;
3512}
3513
Al Viroa1bc6eb2008-07-30 06:32:52 -04003514#ifdef CONFIG_SYSCTL
Al Viroeeb61f72008-07-27 08:59:33 +01003515/*
3516 * We really need to sanitize the damn ipv4 init order, then all
3517 * this nonsense will go away.
3518 */
3519void __init ip_static_sysctl_init(void)
3520{
Al Viro2f4520d2008-08-25 15:17:44 -07003521 register_sysctl_paths(ipv4_path, ipv4_skeleton);
Al Viroeeb61f72008-07-27 08:59:33 +01003522}
Al Viroa1bc6eb2008-07-30 06:32:52 -04003523#endif