/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly	= 20;

static void rt_worker_func(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void rt_emergency_hash_rebuild(struct net *net);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

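/*
 * Worked example of how this table is consumed (a sketch; the actual
 * lookup helper, rt_tos2priority(), lives in include/net/route.h rather
 * than in this file): the table is indexed by the TOS field shifted right
 * by one, so TOS 0x10 (IPTOS_LOWDELAY) maps to entry 0x10 >> 1 = 8, i.e.
 * TC_PRIO_INTERACTIVE, while the low bit of the index selects the
 * ECN_OR_COST() variants.
 */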

/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

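/*
 * A minimal sketch of the reader side of this scheme (illustrative only;
 * it mirrors what rt_cache_get_first()/__rt_cache_get_next() below do,
 * and the names match this file):
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		if (...)	// match on rth->fl, then
 *			break;	// dst_use(&rth->u.dst, jiffies)
 *	rcu_read_unlock_bh();
 */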
struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
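/*
 * Example of the resulting sizing (a sketch using the constants defined
 * just below): with NR_CPUS = 16 we get RT_HASH_LOCK_SZ = 2048 spinlocks,
 * and bucket slot N is guarded by rt_hash_locks[N & (RT_HASH_LOCK_SZ - 1)],
 * so many hash buckets share one lock while independent chains can still
 * be modified concurrently.
 */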
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
		int genid)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, genid)
		& rt_hash_mask;
}

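/*
 * Side note on why the generation id is mixed into the hash (an
 * observation about the code, not a documented guarantee): rt_genid is
 * both an input to rt_hash() and checked by rt_is_expired(), so after
 * rt_cache_invalidate() bumps it, new lookups tend to probe different
 * buckets than the ones holding stale entries, which then wait in cold
 * chains until rt_check_expire()/GC reaps them.
 */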
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rt_hash_table[st->bucket].chain)
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
			   int length, int *eof, void *data)
{
	unsigned int i;

	if ((offset & 3) || (length & 3))
		return -EIO;

	if (offset >= sizeof(struct ip_rt_acct) * 256) {
		*eof = 1;
		return 0;
	}

	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
		length = sizeof(struct ip_rt_acct) * 256 - offset;
		*eof = 1;
	}

	offset /= sizeof(u32);

	if (length > 0) {
		u32 *dst = (u32 *) buffer;

		*start = buffer;
		memset(dst, 0, length);

		for_each_possible_cpu(i) {
			unsigned int j;
			u32 *src;

			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
			for (j = 0; j < length/4; j++)
				dst[j] += src[j];
		}
	}
	return length;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
			ip_rt_acct_read, NULL);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
	remove_proc_entry("rt_acct", net->proc_net);
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

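/*
 * Note on the two comparisons above (explanatory; matches how they are
 * used later in this file): compare_hash_inputs() checks only the fields
 * that feed rt_hash() (daddr, saddr, iif) and is used by rt_check_expire()
 * to count same-bucket entries that differ only in TOS/mark/oif once,
 * while compare_keys() is the full flow match used by rt_intern_hash()
 * when inserting and looking up cache entries.
 */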
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to be rescheduled if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable * tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable ** prev, * p;

		rth = rt_hash_table[i].chain;

		/* defer releasing the head of the list after spin_unlock */
		for (tail = rth; tail; tail = tail->u.dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->u.dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->u.dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth != tail; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimate of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

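/*
 * Worked example of the fixed-point form (illustrative numbers only):
 * with FRACT_BITS = 3, a true average chain length of 2.5 is stored as
 * 2.5 * ONE = 20.  If avg = 20 and sd = 4 (i.e. 0.5 in real terms),
 * rt_check_expire() below sets rt_chain_length_max to
 * max(ip_rt_gc_elasticity, (20 + 4*4) >> FRACT_BITS) = max(8, 4) = 8.
 */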
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, *aux, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	u64 mult;

	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					for (aux = rt_hash_table[i].chain;;) {
						if (aux == rth) {
							length += ONE;
							break;
						}
						if (compare_hash_inputs(&aux->fl, &rth->fl))
							break;
						aux = aux->u.dst.rt_next;
					}
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
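/*
 * Arithmetic behind the 2^24 figure above (a back-of-envelope note):
 * each invalidation advances the 32-bit genid by at least 1 and at most
 * 256, so at least 2^32 / 2^8 = 2^24 invalidations must occur before the
 * counter can wrap around onto a recently used value.
 */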
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}

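/*
 * Usage note (derived from the code above, not an authoritative
 * contract): rt_cache_flush(net, -1) only bumps the generation id and
 * leaves stale entries for rt_check_expire()/GC to reap, while
 * rt_cache_flush(net, 0) additionally walks every bucket via
 * rt_do_flush(), which can take a while on a large cache.
 */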
/*
 * We change rt_genid and let gc do the cleanup
 */
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;
	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_secret_rebuild_oneshot(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
	rt_cache_invalidate(net);
	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network
   is idle, expire is large enough to keep enough warm entries,
   and when load increases, it shrinks to limit the cache size.
 */

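/*
 * Example of the trigger threshold computed below (illustrative numbers
 * only): goal = entries - (ip_rt_gc_elasticity << rt_hash_log), so with
 * the default elasticity of 8 and a hash table of 2^17 buckets
 * (rt_hash_log = 17), collection only starts shrinking the cache once it
 * holds more than 8 << 17, i.e. about one million, entries.
 */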
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area. Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire is reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

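/*
 * Output convention of rt_intern_hash() below (describing the code, for
 * readers): callers pass either rp (lookup paths that want the winning
 * rtable returned) or skb (the input path, where the result is attached
 * to the packet via skb_dst_set()); rp takes precedence when both could
 * apply.
 */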
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		rt_drop(rt);
		return 0;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(dev_net(rt->u.dst.dev))) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
		}
	}

	/* Try to bind the route to arp only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink the route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we do not need to grab a lock to
		   dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

Al Virof7655222006-09-26 21:25:43 -07001301void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1302 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001303{
1304 int i, k;
1305 struct in_device *in_dev = in_dev_get(dev);
1306 struct rtable *rth, **rthp;
Al Virof7655222006-09-26 21:25:43 -07001307 __be32 skeys[2] = { saddr, 0 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001308 int ikeys[2] = { dev->ifindex, 0 };
Tom Tucker8d717402006-07-30 20:43:36 -07001309 struct netevent_redirect netevent;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001310 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001311
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312 if (!in_dev)
1313 return;
1314
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001315 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001316 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001317 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
Joe Perchesf97c1e02007-12-16 13:45:43 -08001318 || ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001319 goto reject_redirect;
1320
Neil Horman1080d702008-10-27 12:28:25 -07001321 if (!rt_caching(net))
1322 goto reject_redirect;
1323
Linus Torvalds1da177e2005-04-16 15:20:36 -07001324 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1325 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1326 goto reject_redirect;
1327 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1328 goto reject_redirect;
1329 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001330 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001331 goto reject_redirect;
1332 }
1333
1334 for (i = 0; i < 2; i++) {
1335 for (k = 0; k < 2; k++) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001336 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001337 rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001338
1339 rthp=&rt_hash_table[hash].chain;
1340
1341 rcu_read_lock();
1342 while ((rth = rcu_dereference(*rthp)) != NULL) {
1343 struct rtable *rt;
1344
1345 if (rth->fl.fl4_dst != daddr ||
1346 rth->fl.fl4_src != skeys[i] ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001347 rth->fl.oif != ikeys[k] ||
Eric Dumazet29e75252008-01-31 17:05:09 -08001348 rth->fl.iif != 0 ||
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001349 rt_is_expired(rth) ||
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001350 !net_eq(dev_net(rth->u.dst.dev), net)) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001351 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001352 continue;
1353 }
1354
1355 if (rth->rt_dst != daddr ||
1356 rth->rt_src != saddr ||
1357 rth->u.dst.error ||
1358 rth->rt_gateway != old_gw ||
1359 rth->u.dst.dev != dev)
1360 break;
1361
1362 dst_hold(&rth->u.dst);
1363 rcu_read_unlock();
1364
1365 rt = dst_alloc(&ipv4_dst_ops);
1366 if (rt == NULL) {
1367 ip_rt_put(rth);
1368 in_dev_put(in_dev);
1369 return;
1370 }
1371
1372 /* Copy all the information. */
1373 *rt = *rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374 rt->u.dst.__use = 1;
1375 atomic_set(&rt->u.dst.__refcnt, 1);
1376 rt->u.dst.child = NULL;
1377 if (rt->u.dst.dev)
1378 dev_hold(rt->u.dst.dev);
1379 if (rt->idev)
1380 in_dev_hold(rt->idev);
1381 rt->u.dst.obsolete = 0;
1382 rt->u.dst.lastuse = jiffies;
1383 rt->u.dst.path = &rt->u.dst;
1384 rt->u.dst.neighbour = NULL;
1385 rt->u.dst.hh = NULL;
Alexey Dobriyandef8b4f2008-10-28 13:24:06 -07001386#ifdef CONFIG_XFRM
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387 rt->u.dst.xfrm = NULL;
Alexey Dobriyandef8b4f2008-10-28 13:24:06 -07001388#endif
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001389 rt->rt_genid = rt_genid(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001390 rt->rt_flags |= RTCF_REDIRECTED;
1391
1392 /* Gateway is different ... */
1393 rt->rt_gateway = new_gw;
1394
1395 /* Redirect received -> path was valid */
1396 dst_confirm(&rth->u.dst);
1397
1398 if (rt->peer)
1399 atomic_inc(&rt->peer->refcnt);
1400
1401 if (arp_bind_neighbour(&rt->u.dst) ||
1402 !(rt->u.dst.neighbour->nud_state &
1403 NUD_VALID)) {
1404 if (rt->u.dst.neighbour)
1405 neigh_event_send(rt->u.dst.neighbour, NULL);
1406 ip_rt_put(rth);
1407 rt_drop(rt);
1408 goto do_next;
1409 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001410
Tom Tucker8d717402006-07-30 20:43:36 -07001411 netevent.old = &rth->u.dst;
1412 netevent.new = &rt->u.dst;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001413 call_netevent_notifiers(NETEVENT_REDIRECT,
1414 &netevent);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415
1416 rt_del(hash, rth);
Eric Dumazet511c3f92009-06-02 05:14:27 +00001417 if (!rt_intern_hash(hash, rt, &rt, NULL))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418 ip_rt_put(rt);
1419 goto do_next;
1420 }
1421 rcu_read_unlock();
1422 do_next:
1423 ;
1424 }
1425 }
1426 in_dev_put(in_dev);
1427 return;
1428
1429reject_redirect:
1430#ifdef CONFIG_IP_ROUTE_VERBOSE
1431 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07001432 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1433 " Advised path = %pI4 -> %pI4\n",
1434 &old_gw, dev->name, &new_gw,
1435 &saddr, &daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436#endif
1437 in_dev_put(in_dev);
1438}
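/*
 * Summary of the checks above (a reading aid, not new policy): a
 * redirect is honoured only if the advertised gateway is a sane
 * unicast address, the cache is active and, on non-shared media, the
 * new gateway is on-link and passes ip_fib_check_default().  Matching
 * cache entries are then cloned with rt_gateway rewritten and
 * RTCF_REDIRECTED set, and the old entries are removed from the chain.
 */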
1439
1440static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1441{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001442 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001443 struct dst_entry *ret = dst;
1444
1445 if (rt) {
1446 if (dst->obsolete) {
1447 ip_rt_put(rt);
1448 ret = NULL;
1449 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1450 rt->u.dst.expires) {
Al Viro8c7bc842006-09-26 21:26:19 -07001451 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001452 rt->fl.oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001453 rt_genid(dev_net(dst->dev)));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454#if RT_CACHE_DEBUG >= 1
Harvey Harrison673d57e2008-10-31 00:53:57 -07001455 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1456 &rt->rt_dst, rt->fl.fl4_tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457#endif
1458 rt_del(hash, rt);
1459 ret = NULL;
1460 }
1461 }
1462 return ret;
1463}
1464
1465/*
1466 * Algorithm:
1467 * 1. The first ip_rt_redirect_number redirects are sent
1468 * with exponential backoff, then we stop sending them at all,
1469 * assuming that the host ignores our redirects.
1470 * 2. If we did not see packets requiring redirects
1471 * during ip_rt_redirect_silence, we assume that the host
1472 * has forgotten the redirected route, and we start sending redirects again.
1473 *
1474 * This algorithm is much cheaper and more intelligent than dumb load limiting
1475 * in icmp.c.
1476 *
1477 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1478 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1479 */
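/*
 * Worked example of step 1, assuming the usual tunable values
 * (their definitions are earlier in this file and not shown here:
 * ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50): the first
 * redirect goes out immediately; after the k-th one the next is held
 * back for ip_rt_redirect_load << k jiffies, so the gap doubles with
 * every unanswered redirect, and after nine of them we stay silent
 * until ip_rt_redirect_silence passes with no triggering packets.
 */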
1480
1481void ip_rt_send_redirect(struct sk_buff *skb)
1482{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001483 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1485
1486 if (!in_dev)
1487 return;
1488
1489 if (!IN_DEV_TX_REDIRECTS(in_dev))
1490 goto out;
1491
1492 /* No redirected packets during ip_rt_redirect_silence;
1493 * reset the algorithm.
1494 */
1495 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1496 rt->u.dst.rate_tokens = 0;
1497
1498 /* Too many ignored redirects; do not send anything.
1499 * Set u.dst.rate_last to the last seen redirected packet.
1500 */
1501 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1502 rt->u.dst.rate_last = jiffies;
1503 goto out;
1504 }
1505
1506 /* Check for load limit; set rate_last to the latest sent
1507 * redirect.
1508 */
Li Yewang14fb8a72006-12-18 00:26:35 -08001509 if (rt->u.dst.rate_tokens == 0 ||
1510 time_after(jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511 (rt->u.dst.rate_last +
1512 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1513 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1514 rt->u.dst.rate_last = jiffies;
1515 ++rt->u.dst.rate_tokens;
1516#ifdef CONFIG_IP_ROUTE_VERBOSE
1517 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1518 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1519 net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07001520 printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1521 &rt->rt_src, rt->rt_iif,
1522 &rt->rt_dst, &rt->rt_gateway);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523#endif
1524 }
1525out:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001526 in_dev_put(in_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527}
1528
1529static int ip_error(struct sk_buff *skb)
1530{
Eric Dumazet511c3f92009-06-02 05:14:27 +00001531 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532 unsigned long now;
1533 int code;
1534
1535 switch (rt->u.dst.error) {
1536 case EINVAL:
1537 default:
1538 goto out;
1539 case EHOSTUNREACH:
1540 code = ICMP_HOST_UNREACH;
1541 break;
1542 case ENETUNREACH:
1543 code = ICMP_NET_UNREACH;
Pavel Emelyanov7c73a6f2008-07-16 20:20:11 -07001544 IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1545 IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001546 break;
1547 case EACCES:
1548 code = ICMP_PKT_FILTERED;
1549 break;
1550 }
1551
1552 now = jiffies;
1553 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1554 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1555 rt->u.dst.rate_tokens = ip_rt_error_burst;
1556 rt->u.dst.rate_last = now;
1557 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1558 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1559 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1560 }
1561
1562out: kfree_skb(skb);
1563 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001564}
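/*
 * The rate limit in ip_error() above is a plain token bucket: tokens
 * accrue at one per jiffy, are capped at ip_rt_error_burst, and each
 * ICMP_DEST_UNREACH costs ip_rt_error_cost.  Assuming the usual
 * defaults (burst = 5*HZ, cost = HZ), a steady overload is answered
 * with roughly one ICMP error per second once the burst is spent.
 */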
Linus Torvalds1da177e2005-04-16 15:20:36 -07001565
1566/*
1567 * The last two values are not from the RFC but
1568 * are needed for AMPRnet AX.25 paths.
1569 */
1570
Arjan van de Ven9b5b5cf2005-11-29 16:21:38 -08001571static const unsigned short mtu_plateau[] =
Linus Torvalds1da177e2005-04-16 15:20:36 -07001572{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1573
Stephen Hemminger5969f712008-04-10 01:52:09 -07001574static inline unsigned short guess_mtu(unsigned short old_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001575{
1576 int i;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001577
Linus Torvalds1da177e2005-04-16 15:20:36 -07001578 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1579 if (old_mtu > mtu_plateau[i])
1580 return mtu_plateau[i];
1581 return 68;
1582}
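/*
 * Example of the plateau search above: old_mtu = 1500 maps to the
 * next lower plateau, 1492; old_mtu = 1006 maps to 576; and anything
 * at or below 128 falls through to the IPv4 minimum of 68.
 */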
1583
Denis V. Lunevb5921912008-01-22 23:50:25 -08001584unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001585 unsigned short new_mtu,
1586 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001587{
Timo Teras0010e462008-04-29 03:32:25 -07001588 int i, k;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589 unsigned short old_mtu = ntohs(iph->tot_len);
1590 struct rtable *rth;
Timo Teras0010e462008-04-29 03:32:25 -07001591 int ikeys[2] = { dev->ifindex, 0 };
Al Viroe4485152006-09-26 22:15:01 -07001592 __be32 skeys[2] = { iph->saddr, 0, };
1593 __be32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001594 unsigned short est_mtu = 0;
1595
1596 if (ipv4_config.no_pmtu_disc)
1597 return 0;
1598
Timo Teras0010e462008-04-29 03:32:25 -07001599 for (k = 0; k < 2; k++) {
1600 for (i = 0; i < 2; i++) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07001601 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001602 rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001603
Timo Teras0010e462008-04-29 03:32:25 -07001604 rcu_read_lock();
1605 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1606 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001607 unsigned short mtu = new_mtu;
1608
Timo Teras0010e462008-04-29 03:32:25 -07001609 if (rth->fl.fl4_dst != daddr ||
1610 rth->fl.fl4_src != skeys[i] ||
1611 rth->rt_dst != daddr ||
1612 rth->rt_src != iph->saddr ||
1613 rth->fl.oif != ikeys[k] ||
1614 rth->fl.iif != 0 ||
1615 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1616 !net_eq(dev_net(rth->u.dst.dev), net) ||
Hugh Dickins6c3b8fc2008-07-26 17:51:06 -07001617 rt_is_expired(rth))
Timo Teras0010e462008-04-29 03:32:25 -07001618 continue;
1619
Linus Torvalds1da177e2005-04-16 15:20:36 -07001620 if (new_mtu < 68 || new_mtu >= old_mtu) {
1621
1622 /* BSD 4.2 compatibility hack :-( */
1623 if (mtu == 0 &&
Rami Rosen6d273f82008-08-06 02:33:49 -07001624 old_mtu >= dst_mtu(&rth->u.dst) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625 old_mtu >= 68 + (iph->ihl << 2))
1626 old_mtu -= iph->ihl << 2;
1627
1628 mtu = guess_mtu(old_mtu);
1629 }
Rami Rosen6d273f82008-08-06 02:33:49 -07001630 if (mtu <= dst_mtu(&rth->u.dst)) {
1631 if (mtu < dst_mtu(&rth->u.dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632 dst_confirm(&rth->u.dst);
1633 if (mtu < ip_rt_min_pmtu) {
1634 mtu = ip_rt_min_pmtu;
1635 rth->u.dst.metrics[RTAX_LOCK-1] |=
1636 (1 << RTAX_MTU);
1637 }
1638 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1639 dst_set_expires(&rth->u.dst,
1640 ip_rt_mtu_expires);
1641 }
1642 est_mtu = mtu;
1643 }
1644 }
Timo Teras0010e462008-04-29 03:32:25 -07001645 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001646 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001647 }
1648 return est_mtu ? : new_mtu;
1649}
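/*
 * Note on the BSD 4.2 hack above: ancient routers report new_mtu == 0
 * in the "fragmentation needed" ICMP, so the estimate falls back to
 * guess_mtu() on the length of the offending datagram (minus the IP
 * header length when that looks plausible).  Whichever estimate wins
 * is floored at ip_rt_min_pmtu and expires after ip_rt_mtu_expires.
 */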
1650
1651static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1652{
Rami Rosen6d273f82008-08-06 02:33:49 -07001653 if (dst_mtu(dst) > mtu && mtu >= 68 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001654 !(dst_metric_locked(dst, RTAX_MTU))) {
1655 if (mtu < ip_rt_min_pmtu) {
1656 mtu = ip_rt_min_pmtu;
1657 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1658 }
1659 dst->metrics[RTAX_MTU-1] = mtu;
1660 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001661 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001662 }
1663}
1664
1665static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1666{
1667 return NULL;
1668}
1669
1670static void ipv4_dst_destroy(struct dst_entry *dst)
1671{
1672 struct rtable *rt = (struct rtable *) dst;
1673 struct inet_peer *peer = rt->peer;
1674 struct in_device *idev = rt->idev;
1675
1676 if (peer) {
1677 rt->peer = NULL;
1678 inet_putpeer(peer);
1679 }
1680
1681 if (idev) {
1682 rt->idev = NULL;
1683 in_dev_put(idev);
1684 }
1685}
1686
1687static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1688 int how)
1689{
1690 struct rtable *rt = (struct rtable *) dst;
1691 struct in_device *idev = rt->idev;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001692 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
Denis V. Lunev5a3e55d2007-12-07 00:38:10 -08001693 struct in_device *loopback_idev =
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001694 in_dev_get(dev_net(dev)->loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001695 if (loopback_idev) {
1696 rt->idev = loopback_idev;
1697 in_dev_put(idev);
1698 }
1699 }
1700}
1701
1702static void ipv4_link_failure(struct sk_buff *skb)
1703{
1704 struct rtable *rt;
1705
1706 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1707
Eric Dumazet511c3f92009-06-02 05:14:27 +00001708 rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709 if (rt)
1710 dst_set_expires(&rt->u.dst, 0);
1711}
1712
1713static int ip_rt_bug(struct sk_buff *skb)
1714{
Harvey Harrison673d57e2008-10-31 00:53:57 -07001715 printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1716 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001717 skb->dev ? skb->dev->name : "?");
1718 kfree_skb(skb);
1719 return 0;
1720}
1721
1722/*
1723 We do not cache the source address of the outgoing interface,
1724 because it is used only by the IP RR, TS and SRR options,
1725 so it is out of the fast path.
1726
1727 BTW remember: "addr" is allowed to be unaligned
1728 in IP options!
1729 */
1730
1731void ip_rt_get_source(u8 *addr, struct rtable *rt)
1732{
Al Viroa61ced52006-09-26 21:27:54 -07001733 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734 struct fib_result res;
1735
1736 if (rt->fl.iif == 0)
1737 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001738 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739 src = FIB_RES_PREFSRC(res);
1740 fib_res_put(&res);
1741 } else
1742 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1743 RT_SCOPE_UNIVERSE);
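	/* memcpy rather than a direct __be32 store: as the comment
	 * above this function warns, "addr" may be unaligned. */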
1744 memcpy(addr, &src, 4);
1745}
1746
1747#ifdef CONFIG_NET_CLS_ROUTE
1748static void set_class_tag(struct rtable *rt, u32 tag)
1749{
1750 if (!(rt->u.dst.tclassid & 0xFFFF))
1751 rt->u.dst.tclassid |= tag & 0xFFFF;
1752 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1753 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1754}
1755#endif
1756
1757static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1758{
1759 struct fib_info *fi = res->fi;
1760
1761 if (fi) {
1762 if (FIB_RES_GW(*res) &&
1763 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1764 rt->rt_gateway = FIB_RES_GW(*res);
1765 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1766 sizeof(rt->u.dst.metrics));
1767 if (fi->fib_mtu == 0) {
1768 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
Satoru SATOH0bbeafd2008-05-04 22:12:43 -07001769 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001770 rt->rt_gateway != rt->rt_dst &&
1771 rt->u.dst.dev->mtu > 576)
1772 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1773 }
1774#ifdef CONFIG_NET_CLS_ROUTE
1775 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1776#endif
1777 } else
1778 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1779
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001780 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
Rami Rosen6d273f82008-08-06 02:33:49 -07001782 if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001783 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001784 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001785 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1786 ip_rt_min_advmss);
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001787 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001788 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1789
1790#ifdef CONFIG_NET_CLS_ROUTE
1791#ifdef CONFIG_IP_MULTIPLE_TABLES
1792 set_class_tag(rt, fib_rules_tclass(res));
1793#endif
1794 set_class_tag(rt, itag);
1795#endif
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001796 rt->rt_type = res->type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001797}
1798
Al Viro9e12bb22006-09-26 21:25:20 -07001799static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001800 u8 tos, struct net_device *dev, int our)
1801{
1802 unsigned hash;
1803 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001804 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001805 struct in_device *in_dev = in_dev_get(dev);
1806 u32 itag = 0;
1807
1808 /* Primary sanity checks. */
1809
1810 if (in_dev == NULL)
1811 return -EINVAL;
1812
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001813 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001814 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815 goto e_inval;
1816
Joe Perchesf97c1e02007-12-16 13:45:43 -08001817 if (ipv4_is_zeronet(saddr)) {
1818 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001819 goto e_inval;
1820 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1821 } else if (fib_validate_source(saddr, 0, tos, 0,
1822 dev, &spec_dst, &itag) < 0)
1823 goto e_inval;
1824
1825 rth = dst_alloc(&ipv4_dst_ops);
1826 if (!rth)
1827 goto e_nobufs;
1828
1829 rth->u.dst.output= ip_rt_bug;
1830
1831 atomic_set(&rth->u.dst.__refcnt, 1);
1832 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001833 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001834 rth->u.dst.flags |= DST_NOPOLICY;
1835 rth->fl.fl4_dst = daddr;
1836 rth->rt_dst = daddr;
1837 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001838 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001839 rth->fl.fl4_src = saddr;
1840 rth->rt_src = saddr;
1841#ifdef CONFIG_NET_CLS_ROUTE
1842 rth->u.dst.tclassid = itag;
1843#endif
1844 rth->rt_iif =
1845 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001846 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001847 dev_hold(rth->u.dst.dev);
1848 rth->idev = in_dev_get(rth->u.dst.dev);
1849 rth->fl.oif = 0;
1850 rth->rt_gateway = daddr;
1851 rth->rt_spec_dst= spec_dst;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001852 rth->rt_genid = rt_genid(dev_net(dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001853 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001854 rth->rt_type = RTN_MULTICAST;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855 if (our) {
1856 rth->u.dst.input= ip_local_deliver;
1857 rth->rt_flags |= RTCF_LOCAL;
1858 }
1859
1860#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001861 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001862 rth->u.dst.input = ip_mr_input;
1863#endif
1864 RT_CACHE_STAT_INC(in_slow_mc);
1865
1866 in_dev_put(in_dev);
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001867 hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
Eric Dumazet511c3f92009-06-02 05:14:27 +00001868 return rt_intern_hash(hash, rth, NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001869
1870e_nobufs:
1871 in_dev_put(in_dev);
1872 return -ENOBUFS;
1873
1874e_inval:
1875 in_dev_put(in_dev);
1876 return -EINVAL;
1877}
1878
1879
1880static void ip_handle_martian_source(struct net_device *dev,
1881 struct in_device *in_dev,
1882 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001883 __be32 daddr,
1884 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001885{
1886 RT_CACHE_STAT_INC(in_martian_src);
1887#ifdef CONFIG_IP_ROUTE_VERBOSE
1888 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1889 /*
1890 * RFC1812 recommendation: if the source is martian,
1891 * the only hint is the MAC header.
1892 */
Harvey Harrison673d57e2008-10-31 00:53:57 -07001893 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1894 &daddr, &saddr, dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001895 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001896 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001897 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001898 printk(KERN_WARNING "ll header: ");
1899 for (i = 0; i < dev->hard_header_len; i++, p++) {
1900 printk("%02x", *p);
1901 if (i < (dev->hard_header_len - 1))
1902 printk(":");
1903 }
1904 printk("\n");
1905 }
1906 }
1907#endif
1908}
1909
Stephen Hemminger5969f712008-04-10 01:52:09 -07001910static int __mkroute_input(struct sk_buff *skb,
1911 struct fib_result *res,
1912 struct in_device *in_dev,
1913 __be32 daddr, __be32 saddr, u32 tos,
1914 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915{
1916
1917 struct rtable *rth;
1918 int err;
1919 struct in_device *out_dev;
1920 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001921 __be32 spec_dst;
1922 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001923
1924 /* get a working reference to the output device */
1925 out_dev = in_dev_get(FIB_RES_DEV(*res));
1926 if (out_dev == NULL) {
1927 if (net_ratelimit())
1928 printk(KERN_CRIT "Bug in ip_route_input" \
1929 "_slow(). Please, report\n");
1930 return -EINVAL;
1931 }
1932
1933
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001934 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001935 in_dev->dev, &spec_dst, &itag);
1936 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001937 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001938 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001939
Linus Torvalds1da177e2005-04-16 15:20:36 -07001940 err = -EINVAL;
1941 goto cleanup;
1942 }
1943
1944 if (err)
1945 flags |= RTCF_DIRECTSRC;
1946
Thomas Graf51b77ca2008-06-03 16:36:01 -07001947 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001948 (IN_DEV_SHARED_MEDIA(out_dev) ||
1949 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1950 flags |= RTCF_DOREDIRECT;
1951
1952 if (skb->protocol != htons(ETH_P_IP)) {
1953 /* Not IP (i.e. ARP). Do not create a route if it is
1954 * invalid for proxy arp. DNAT routes are always valid.
1955 */
Rami Rosencb7928a2008-01-09 00:18:24 -08001956 if (out_dev == in_dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001957 err = -EINVAL;
1958 goto cleanup;
1959 }
1960 }
1961
1962
1963 rth = dst_alloc(&ipv4_dst_ops);
1964 if (!rth) {
1965 err = -ENOBUFS;
1966 goto cleanup;
1967 }
1968
Julian Anastasovce723d82005-09-08 13:34:47 -07001969 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001970 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001971 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001972 rth->u.dst.flags |= DST_NOPOLICY;
Herbert Xu42f811b2007-06-04 23:34:44 -07001973 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001974 rth->u.dst.flags |= DST_NOXFRM;
1975 rth->fl.fl4_dst = daddr;
1976 rth->rt_dst = daddr;
1977 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001978 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001979 rth->fl.fl4_src = saddr;
1980 rth->rt_src = saddr;
1981 rth->rt_gateway = daddr;
1982 rth->rt_iif =
1983 rth->fl.iif = in_dev->dev->ifindex;
1984 rth->u.dst.dev = (out_dev)->dev;
1985 dev_hold(rth->u.dst.dev);
1986 rth->idev = in_dev_get(rth->u.dst.dev);
1987 rth->fl.oif = 0;
1988 rth->rt_spec_dst= spec_dst;
1989
1990 rth->u.dst.input = ip_forward;
1991 rth->u.dst.output = ip_output;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07001992 rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993
1994 rt_set_nexthop(rth, res, itag);
1995
1996 rth->rt_flags = flags;
1997
1998 *result = rth;
1999 err = 0;
2000 cleanup:
2001 /* release the working reference to the output device */
2002 in_dev_put(out_dev);
2003 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002004}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002005
Stephen Hemminger5969f712008-04-10 01:52:09 -07002006static int ip_mkroute_input(struct sk_buff *skb,
2007 struct fib_result *res,
2008 const struct flowi *fl,
2009 struct in_device *in_dev,
2010 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002011{
Chuck Short7abaa272005-06-22 22:10:23 -07002012 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002013 int err;
2014 unsigned hash;
2015
2016#ifdef CONFIG_IP_ROUTE_MULTIPATH
2017 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2018 fib_select_multipath(fl, res);
2019#endif
2020
2021 /* create a routing cache entry */
2022 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2023 if (err)
2024 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002025
2026 /* put it into the cache */
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002027 hash = rt_hash(daddr, saddr, fl->iif,
2028 rt_genid(dev_net(rth->u.dst.dev)));
Eric Dumazet511c3f92009-06-02 05:14:27 +00002029 return rt_intern_hash(hash, rth, NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002030}
2031
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032/*
2033 * NOTE. We drop all packets that have local source
2034 * addresses, because every properly looped-back packet
2035 * must already have the correct destination attached by the output routine.
2036 *
2037 * Such an approach solves two big problems:
2038 * 1. Non-simplex devices are handled properly.
2039 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2040 */
2041
Al Viro9e12bb22006-09-26 21:25:20 -07002042static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002043 u8 tos, struct net_device *dev)
2044{
2045 struct fib_result res;
2046 struct in_device *in_dev = in_dev_get(dev);
2047 struct flowi fl = { .nl_u = { .ip4_u =
2048 { .daddr = daddr,
2049 .saddr = saddr,
2050 .tos = tos,
2051 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002053 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002054 .iif = dev->ifindex };
2055 unsigned flags = 0;
2056 u32 itag = 0;
2057 struct rtable * rth;
2058 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07002059 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002060 int err = -EINVAL;
2061 int free_res = 0;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002062 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002063
2064 /* IP on this device is disabled. */
2065
2066 if (!in_dev)
2067 goto out;
2068
2069 /* Check for the most weird martians, which cannot be detected
2070 by fib_lookup.
2071 */
2072
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002073 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002074 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002075 goto martian_source;
2076
Al Viroe4485152006-09-26 22:15:01 -07002077 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002078 goto brd_input;
2079
2080 /* Accept zero addresses only for limited broadcast;
2081 * I do not even know whether to fix it or not. Waiting for complaints :-)
2082 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002083 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002084 goto martian_source;
2085
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002086 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002087 ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088 goto martian_destination;
2089
2090 /*
2091 * Now we are ready to route packet.
2092 */
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002093 if ((err = fib_lookup(net, &fl, &res)) != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002094 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002095 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002096 goto no_route;
2097 }
2098 free_res = 1;
2099
2100 RT_CACHE_STAT_INC(in_slow_tot);
2101
2102 if (res.type == RTN_BROADCAST)
2103 goto brd_input;
2104
2105 if (res.type == RTN_LOCAL) {
2106 int result;
2107 result = fib_validate_source(saddr, daddr, tos,
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002108 net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002109 dev, &spec_dst, &itag);
2110 if (result < 0)
2111 goto martian_source;
2112 if (result)
2113 flags |= RTCF_DIRECTSRC;
2114 spec_dst = daddr;
2115 goto local_input;
2116 }
2117
2118 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002119 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002120 if (res.type != RTN_UNICAST)
2121 goto martian_destination;
2122
2123 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002124done:
2125 in_dev_put(in_dev);
2126 if (free_res)
2127 fib_res_put(&res);
2128out: return err;
2129
2130brd_input:
2131 if (skb->protocol != htons(ETH_P_IP))
2132 goto e_inval;
2133
Joe Perchesf97c1e02007-12-16 13:45:43 -08002134 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002135 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2136 else {
2137 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2138 &itag);
2139 if (err < 0)
2140 goto martian_source;
2141 if (err)
2142 flags |= RTCF_DIRECTSRC;
2143 }
2144 flags |= RTCF_BROADCAST;
2145 res.type = RTN_BROADCAST;
2146 RT_CACHE_STAT_INC(in_brd);
2147
2148local_input:
2149 rth = dst_alloc(&ipv4_dst_ops);
2150 if (!rth)
2151 goto e_nobufs;
2152
2153 rth->u.dst.output= ip_rt_bug;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002154 rth->rt_genid = rt_genid(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002155
2156 atomic_set(&rth->u.dst.__refcnt, 1);
2157 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002158 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002159 rth->u.dst.flags |= DST_NOPOLICY;
2160 rth->fl.fl4_dst = daddr;
2161 rth->rt_dst = daddr;
2162 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002163 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002164 rth->fl.fl4_src = saddr;
2165 rth->rt_src = saddr;
2166#ifdef CONFIG_NET_CLS_ROUTE
2167 rth->u.dst.tclassid = itag;
2168#endif
2169 rth->rt_iif =
2170 rth->fl.iif = dev->ifindex;
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002171 rth->u.dst.dev = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002172 dev_hold(rth->u.dst.dev);
2173 rth->idev = in_dev_get(rth->u.dst.dev);
2174 rth->rt_gateway = daddr;
2175 rth->rt_spec_dst= spec_dst;
2176 rth->u.dst.input= ip_local_deliver;
2177 rth->rt_flags = flags|RTCF_LOCAL;
2178 if (res.type == RTN_UNREACHABLE) {
2179 rth->u.dst.input= ip_error;
2180 rth->u.dst.error= -err;
2181 rth->rt_flags &= ~RTCF_LOCAL;
2182 }
2183 rth->rt_type = res.type;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002184 hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
Eric Dumazet511c3f92009-06-02 05:14:27 +00002185 err = rt_intern_hash(hash, rth, NULL, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002186 goto done;
2187
2188no_route:
2189 RT_CACHE_STAT_INC(in_no_route);
2190 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2191 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002192 if (err == -ESRCH)
2193 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194 goto local_input;
2195
2196 /*
2197 * Do not cache martian addresses: they should be logged (RFC1812)
2198 */
2199martian_destination:
2200 RT_CACHE_STAT_INC(in_martian_dst);
2201#ifdef CONFIG_IP_ROUTE_VERBOSE
2202 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
Harvey Harrison673d57e2008-10-31 00:53:57 -07002203 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2204 &daddr, &saddr, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002206
2207e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002208 err = -EHOSTUNREACH;
2209 goto done;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002210
Linus Torvalds1da177e2005-04-16 15:20:36 -07002211e_inval:
2212 err = -EINVAL;
2213 goto done;
2214
2215e_nobufs:
2216 err = -ENOBUFS;
2217 goto done;
2218
2219martian_source:
2220 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2221 goto e_inval;
2222}
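/*
 * A condensed map of the slow path above: reject martian sources,
 * shortcut limited broadcast, run fib_lookup(), then dispatch on the
 * result type (RTN_BROADCAST, RTN_LOCAL, RTN_UNICAST).  Only the
 * unicast case builds a forwarding entry via ip_mkroute_input();
 * everything else is delivered locally or answered with an error.
 */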
2223
Al Viro9e12bb22006-09-26 21:25:20 -07002224int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225 u8 tos, struct net_device *dev)
2226{
2227 struct rtable * rth;
2228 unsigned hash;
2229 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002230 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002232 net = dev_net(dev);
Neil Horman1080d702008-10-27 12:28:25 -07002233
2234 if (!rt_caching(net))
2235 goto skip_cache;
2236
Linus Torvalds1da177e2005-04-16 15:20:36 -07002237 tos &= IPTOS_RT_MASK;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002238 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239
2240 rcu_read_lock();
2241 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002242 rth = rcu_dereference(rth->u.dst.rt_next)) {
Stephen Hemmingerc0b8c322008-04-10 04:00:28 -07002243 if (((rth->fl.fl4_dst ^ daddr) |
2244 (rth->fl.fl4_src ^ saddr) |
2245 (rth->fl.iif ^ iif) |
2246 rth->fl.oif |
2247 (rth->fl.fl4_tos ^ tos)) == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002248 rth->fl.mark == skb->mark &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002249 net_eq(dev_net(rth->u.dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002250 !rt_is_expired(rth)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002251 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002252 RT_CACHE_STAT_INC(in_hit);
2253 rcu_read_unlock();
Eric Dumazetadf30902009-06-02 05:19:30 +00002254 skb_dst_set(skb, &rth->u.dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002255 return 0;
2256 }
2257 RT_CACHE_STAT_INC(in_hlist_search);
2258 }
2259 rcu_read_unlock();
2260
Neil Horman1080d702008-10-27 12:28:25 -07002261skip_cache:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002262 /* Multicast recognition logic was moved from the route cache to here.
2263 The problem was that too many Ethernet cards have broken/missing
2264 hardware multicast filters :-( As a result, a host on a multicast
2265 network acquires a lot of useless route cache entries, e.g. for
2266 SDR messages from all over the world. Now we try to get rid of them.
2267 Really, provided the software IP multicast filter is organized
2268 reasonably (at least, hashed), it does not result in a slowdown
2269 compared with route cache reject entries.
2270 Note that multicast routers are not affected, because
2271 a route cache entry is created eventually.
2272 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002273 if (ipv4_is_multicast(daddr)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002274 struct in_device *in_dev;
2275
2276 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002277 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002278 int our = ip_check_mc(in_dev, daddr, saddr,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002279 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002280 if (our
2281#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002282 || (!ipv4_is_local_multicast(daddr) &&
2283 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002284#endif
2285 ) {
2286 rcu_read_unlock();
2287 return ip_route_input_mc(skb, daddr, saddr,
2288 tos, dev, our);
2289 }
2290 }
2291 rcu_read_unlock();
2292 return -EINVAL;
2293 }
2294 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2295}
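/*
 * Typical caller pattern, a sketch modelled on ip_rcv_finish() in
 * ip_input.c (not shown in this file):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr,
 *			   iph->tos, skb->dev))
 *		goto drop;
 *	... skb_dst(skb) now holds the cached or freshly built route ...
 */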
2296
Stephen Hemminger5969f712008-04-10 01:52:09 -07002297static int __mkroute_output(struct rtable **result,
2298 struct fib_result *res,
2299 const struct flowi *fl,
2300 const struct flowi *oldflp,
2301 struct net_device *dev_out,
2302 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002303{
2304 struct rtable *rth;
2305 struct in_device *in_dev;
2306 u32 tos = RT_FL_TOS(oldflp);
2307 int err = 0;
2308
Joe Perchesf97c1e02007-12-16 13:45:43 -08002309 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002310 return -EINVAL;
2311
Al Viroe4485152006-09-26 22:15:01 -07002312 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313 res->type = RTN_BROADCAST;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002314 else if (ipv4_is_multicast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002315 res->type = RTN_MULTICAST;
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002316 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317 return -EINVAL;
2318
2319 if (dev_out->flags & IFF_LOOPBACK)
2320 flags |= RTCF_LOCAL;
2321
2322 /* get work reference to inet device */
2323 in_dev = in_dev_get(dev_out);
2324 if (!in_dev)
2325 return -EINVAL;
2326
2327 if (res->type == RTN_BROADCAST) {
2328 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2329 if (res->fi) {
2330 fib_info_put(res->fi);
2331 res->fi = NULL;
2332 }
2333 } else if (res->type == RTN_MULTICAST) {
2334 flags |= RTCF_MULTICAST|RTCF_LOCAL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002335 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002336 oldflp->proto))
2337 flags &= ~RTCF_LOCAL;
2338 /* If a multicast route does not exist, use
2339 the default one, but do not gateway in this case.
2340 Yes, it is a hack.
2341 */
2342 if (res->fi && res->prefixlen < 4) {
2343 fib_info_put(res->fi);
2344 res->fi = NULL;
2345 }
2346 }
2347
2348
2349 rth = dst_alloc(&ipv4_dst_ops);
2350 if (!rth) {
2351 err = -ENOBUFS;
2352 goto cleanup;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002353 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002354
Julian Anastasovce723d82005-09-08 13:34:47 -07002355 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002356 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002357 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002358 rth->u.dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002359 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002360 rth->u.dst.flags |= DST_NOPOLICY;
2361
2362 rth->fl.fl4_dst = oldflp->fl4_dst;
2363 rth->fl.fl4_tos = tos;
2364 rth->fl.fl4_src = oldflp->fl4_src;
2365 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002366 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367 rth->rt_dst = fl->fl4_dst;
2368 rth->rt_src = fl->fl4_src;
2369 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002370 /* get references to the devices that are to be held by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371 cache entry */
2372 rth->u.dst.dev = dev_out;
2373 dev_hold(dev_out);
2374 rth->idev = in_dev_get(dev_out);
2375 rth->rt_gateway = fl->fl4_dst;
2376 rth->rt_spec_dst= fl->fl4_src;
2377
2378 rth->u.dst.output=ip_output;
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002379 rth->rt_genid = rt_genid(dev_net(dev_out));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002380
2381 RT_CACHE_STAT_INC(out_slow_tot);
2382
2383 if (flags & RTCF_LOCAL) {
2384 rth->u.dst.input = ip_local_deliver;
2385 rth->rt_spec_dst = fl->fl4_dst;
2386 }
2387 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2388 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002389 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390 !(dev_out->flags & IFF_LOOPBACK)) {
2391 rth->u.dst.output = ip_mc_output;
2392 RT_CACHE_STAT_INC(out_slow_mc);
2393 }
2394#ifdef CONFIG_IP_MROUTE
2395 if (res->type == RTN_MULTICAST) {
2396 if (IN_DEV_MFORWARD(in_dev) &&
Joe Perchesf97c1e02007-12-16 13:45:43 -08002397 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002398 rth->u.dst.input = ip_mr_input;
2399 rth->u.dst.output = ip_mc_output;
2400 }
2401 }
2402#endif
2403 }
2404
2405 rt_set_nexthop(rth, res, 0);
2406
2407 rth->rt_flags = flags;
2408
2409 *result = rth;
2410 cleanup:
2411 /* release work reference to inet device */
2412 in_dev_put(in_dev);
2413
2414 return err;
2415}
2416
Stephen Hemminger5969f712008-04-10 01:52:09 -07002417static int ip_mkroute_output(struct rtable **rp,
2418 struct fib_result *res,
2419 const struct flowi *fl,
2420 const struct flowi *oldflp,
2421 struct net_device *dev_out,
2422 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002423{
Chuck Short7abaa272005-06-22 22:10:23 -07002424 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2426 unsigned hash;
2427 if (err == 0) {
Denis V. Lunevb00180d2008-07-05 19:04:09 -07002428 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002429 rt_genid(dev_net(dev_out)));
Eric Dumazet511c3f92009-06-02 05:14:27 +00002430 err = rt_intern_hash(hash, rth, rp, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002431 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002432
Linus Torvalds1da177e2005-04-16 15:20:36 -07002433 return err;
2434}
2435
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436/*
2437 * Major route resolver routine.
2438 */
2439
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002440static int ip_route_output_slow(struct net *net, struct rtable **rp,
2441 const struct flowi *oldflp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002442{
2443 u32 tos = RT_FL_TOS(oldflp);
2444 struct flowi fl = { .nl_u = { .ip4_u =
2445 { .daddr = oldflp->fl4_dst,
2446 .saddr = oldflp->fl4_src,
2447 .tos = tos & IPTOS_RT_MASK,
2448 .scope = ((tos & RTO_ONLINK) ?
2449 RT_SCOPE_LINK :
2450 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002451 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002452 .mark = oldflp->mark,
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002453 .iif = net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002454 .oif = oldflp->oif };
2455 struct fib_result res;
2456 unsigned flags = 0;
2457 struct net_device *dev_out = NULL;
2458 int free_res = 0;
2459 int err;
2460
2461
2462 res.fi = NULL;
2463#ifdef CONFIG_IP_MULTIPLE_TABLES
2464 res.r = NULL;
2465#endif
2466
2467 if (oldflp->fl4_src) {
2468 err = -EINVAL;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002469 if (ipv4_is_multicast(oldflp->fl4_src) ||
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002470 ipv4_is_lbcast(oldflp->fl4_src) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002471 ipv4_is_zeronet(oldflp->fl4_src))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002472 goto out;
2473
Linus Torvalds1da177e2005-04-16 15:20:36 -07002474 /* I removed the check for oif == dev_out->oif here.
2475 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002476 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2477 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002478 2. Moreover, we are allowed to send packets with saddr
2479 of another iface. --ANK
2480 */
2481
David S. Millerf6c5d732007-05-18 02:07:50 -07002482 if (oldflp->oif == 0
Joe Perchesf97c1e02007-12-16 13:45:43 -08002483 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2484 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Julian Anastasova210d012008-10-01 07:28:28 -07002485 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2486 dev_out = ip_dev_find(net, oldflp->fl4_src);
2487 if (dev_out == NULL)
2488 goto out;
2489
Linus Torvalds1da177e2005-04-16 15:20:36 -07002490 /* Special hack: the user can direct multicasts
2491 and limited broadcast via the necessary interface
2492 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2493 This hack is not just for fun, it allows
2494 vic, vat and friends to work.
2495 They bind a socket to loopback, set ttl to zero
2496 and expect that it will work.
2497 From the viewpoint of the routing cache they are broken,
2498 because we are not allowed to build a multicast path
2499 with a loopback source addr (look, the routing cache
2500 cannot know that ttl is zero, so the packet
2501 will not leave this host and the route is valid).
2502 Luckily, this hack is a good workaround.
2503 */
2504
2505 fl.oif = dev_out->ifindex;
2506 goto make_route;
2507 }
Julian Anastasova210d012008-10-01 07:28:28 -07002508
2509 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2510 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2511 dev_out = ip_dev_find(net, oldflp->fl4_src);
2512 if (dev_out == NULL)
2513 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002514 dev_put(dev_out);
Julian Anastasova210d012008-10-01 07:28:28 -07002515 dev_out = NULL;
2516 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002517 }
2518
2519
2520 if (oldflp->oif) {
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002521 dev_out = dev_get_by_index(net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002522 err = -ENODEV;
2523 if (dev_out == NULL)
2524 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002525
2526 /* RACE: Check return value of inet_select_addr instead. */
2527 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002528 dev_put(dev_out);
2529 goto out; /* Wrong error code */
2530 }
2531
Joe Perchesf97c1e02007-12-16 13:45:43 -08002532 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2533 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002534 if (!fl.fl4_src)
2535 fl.fl4_src = inet_select_addr(dev_out, 0,
2536 RT_SCOPE_LINK);
2537 goto make_route;
2538 }
2539 if (!fl.fl4_src) {
Joe Perchesf97c1e02007-12-16 13:45:43 -08002540 if (ipv4_is_multicast(oldflp->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002541 fl.fl4_src = inet_select_addr(dev_out, 0,
2542 fl.fl4_scope);
2543 else if (!oldflp->fl4_dst)
2544 fl.fl4_src = inet_select_addr(dev_out, 0,
2545 RT_SCOPE_HOST);
2546 }
2547 }
2548
2549 if (!fl.fl4_dst) {
2550 fl.fl4_dst = fl.fl4_src;
2551 if (!fl.fl4_dst)
2552 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2553 if (dev_out)
2554 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002555 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002556 dev_hold(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002557 fl.oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002558 res.type = RTN_LOCAL;
2559 flags |= RTCF_LOCAL;
2560 goto make_route;
2561 }
2562
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002563 if (fib_lookup(net, &fl, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002564 res.fi = NULL;
2565 if (oldflp->oif) {
2566 /* Apparently, the routing tables are wrong. Assume
2567 that the destination is on-link.
2568
2569 WHY? DW.
2570 Because we are allowed to send to an iface
2571 even if it has NO routes and NO assigned
2572 addresses. When oif is specified, the routing
2573 tables are looked up with only one purpose:
2574 to check whether the destination is gatewayed, rather than
2575 direct. Moreover, if MSG_DONTROUTE is set,
2576 we send the packet, ignoring both the routing tables
2577 and the ifaddr state. --ANK
2578
2579
2580 We could make it even if oif is unknown,
2581 likely IPv6, but we do not.
2582 */
2583
2584 if (fl.fl4_src == 0)
2585 fl.fl4_src = inet_select_addr(dev_out, 0,
2586 RT_SCOPE_LINK);
2587 res.type = RTN_UNICAST;
2588 goto make_route;
2589 }
2590 if (dev_out)
2591 dev_put(dev_out);
2592 err = -ENETUNREACH;
2593 goto out;
2594 }
2595 free_res = 1;
2596
2597 if (res.type == RTN_LOCAL) {
2598 if (!fl.fl4_src)
2599 fl.fl4_src = fl.fl4_dst;
2600 if (dev_out)
2601 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002602 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002603 dev_hold(dev_out);
2604 fl.oif = dev_out->ifindex;
2605 if (res.fi)
2606 fib_info_put(res.fi);
2607 res.fi = NULL;
2608 flags |= RTCF_LOCAL;
2609 goto make_route;
2610 }
2611
2612#ifdef CONFIG_IP_ROUTE_MULTIPATH
2613 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2614 fib_select_multipath(&fl, &res);
2615 else
2616#endif
2617 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002618 fib_select_default(net, &fl, &res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002619
2620 if (!fl.fl4_src)
2621 fl.fl4_src = FIB_RES_PREFSRC(res);
2622
2623 if (dev_out)
2624 dev_put(dev_out);
2625 dev_out = FIB_RES_DEV(res);
2626 dev_hold(dev_out);
2627 fl.oif = dev_out->ifindex;
2628
2629
2630make_route:
2631 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2632
2633
2634 if (free_res)
2635 fib_res_put(&res);
2636 if (dev_out)
2637 dev_put(dev_out);
2638out: return err;
2639}
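/*
 * Resolution order in the slow path above, for reference: validate a
 * caller-supplied source address, honour an explicit oif binding,
 * fall back to loopback when no destination is given, then run
 * fib_lookup(); multipath and default-route selection come last,
 * before ip_mkroute_output() builds and caches the entry.
 */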
2640
Denis V. Lunev611c1832008-01-22 22:06:48 -08002641int __ip_route_output_key(struct net *net, struct rtable **rp,
2642 const struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002643{
2644 unsigned hash;
2645 struct rtable *rth;
2646
Neil Horman1080d702008-10-27 12:28:25 -07002647 if (!rt_caching(net))
2648 goto slow_output;
2649
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002650 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002651
2652 rcu_read_lock_bh();
2653 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002654 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002655 if (rth->fl.fl4_dst == flp->fl4_dst &&
2656 rth->fl.fl4_src == flp->fl4_src &&
2657 rth->fl.iif == 0 &&
2658 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002659 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002661 (IPTOS_RT_MASK | RTO_ONLINK)) &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002662 net_eq(dev_net(rth->u.dst.dev), net) &&
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002663 !rt_is_expired(rth)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002664 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665 RT_CACHE_STAT_INC(out_hit);
2666 rcu_read_unlock_bh();
2667 *rp = rth;
2668 return 0;
2669 }
2670 RT_CACHE_STAT_INC(out_hlist_search);
2671 }
2672 rcu_read_unlock_bh();
2673
Neil Horman1080d702008-10-27 12:28:25 -07002674slow_output:
Denis V. Lunev611c1832008-01-22 22:06:48 -08002675 return ip_route_output_slow(net, rp, flp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002676}
2677
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002678EXPORT_SYMBOL_GPL(__ip_route_output_key);
2679
David S. Miller14e50e52007-05-24 18:17:54 -07002680static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2681{
2682}
2683
2684static struct dst_ops ipv4_dst_blackhole_ops = {
2685 .family = AF_INET,
Harvey Harrison09640e62009-02-01 00:45:17 -08002686 .protocol = cpu_to_be16(ETH_P_IP),
David S. Miller14e50e52007-05-24 18:17:54 -07002687 .destroy = ipv4_dst_destroy,
2688 .check = ipv4_dst_check,
2689 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
Eric Dumazete2422972008-01-30 20:07:45 -08002690 .entries = ATOMIC_INIT(0),
David S. Miller14e50e52007-05-24 18:17:54 -07002691};
2692
2693
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002694static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
David S. Miller14e50e52007-05-24 18:17:54 -07002695{
2696 struct rtable *ort = *rp;
2697 struct rtable *rt = (struct rtable *)
2698 dst_alloc(&ipv4_dst_blackhole_ops);
2699
2700 if (rt) {
2701 struct dst_entry *new = &rt->u.dst;
2702
2703 atomic_set(&new->__refcnt, 1);
2704 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002705 new->input = dst_discard;
2706 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002707 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2708
2709 new->dev = ort->u.dst.dev;
2710 if (new->dev)
2711 dev_hold(new->dev);
2712
2713 rt->fl = ort->fl;
2714
2715 rt->idev = ort->idev;
2716 if (rt->idev)
2717 in_dev_hold(rt->idev);
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002718 rt->rt_genid = rt_genid(net);
David S. Miller14e50e52007-05-24 18:17:54 -07002719 rt->rt_flags = ort->rt_flags;
2720 rt->rt_type = ort->rt_type;
2721 rt->rt_dst = ort->rt_dst;
2722 rt->rt_src = ort->rt_src;
2723 rt->rt_iif = ort->rt_iif;
2724 rt->rt_gateway = ort->rt_gateway;
2725 rt->rt_spec_dst = ort->rt_spec_dst;
2726 rt->peer = ort->peer;
2727 if (rt->peer)
2728 atomic_inc(&rt->peer->refcnt);
2729
2730 dst_free(new);
2731 }
2732
2733 dst_release(&(*rp)->u.dst);
2734 *rp = rt;
2735 return (rt ? 0 : -ENOMEM);
2736}
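/*
 * ipv4_dst_blackhole() is used by ip_route_output_flow() below when
 * __xfrm_lookup() returns -EREMOTE: the flow keys of the original
 * route are preserved, but both input and output are pointed at
 * dst_discard, so packets on the route vanish silently.
 */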
2737
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002738int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2739 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002740{
2741 int err;
2742
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002743 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002744 return err;
2745
2746 if (flp->proto) {
2747 if (!flp->fl4_src)
2748 flp->fl4_src = (*rp)->rt_src;
2749 if (!flp->fl4_dst)
2750 flp->fl4_dst = (*rp)->rt_dst;
Alexey Dobriyan52479b62008-11-25 17:35:18 -08002751 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
Herbert Xubb728452007-12-12 18:48:58 -08002752 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002753 if (err == -EREMOTE)
Denis V. Luneve84f84f2008-07-05 19:04:32 -07002754 err = ipv4_dst_blackhole(net, rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002755
2756 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002757 }
2758
2759 return 0;
2760}
2761
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002762EXPORT_SYMBOL_GPL(ip_route_output_flow);
2763
Denis V. Lunevf2063512008-01-22 22:07:34 -08002764int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002765{
Denis V. Lunevf2063512008-01-22 22:07:34 -08002766 return ip_route_output_flow(net, rp, flp, NULL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002767}
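/*
 * Minimal lookup sketch for this entry point (hypothetical caller;
 * "dip" stands in for a destination in network byte order):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&init_net, &rt, &fl) == 0) {
 *		... transmit via rt->u.dst ...
 *		ip_rt_put(rt);
 *	}
 */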
2768
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
	if (rt->peer) {
		id = rt->peer->ip_id_count;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

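/*
 * inet_rtm_getroute - handle an RTM_GETROUTE request from userspace.
 *
 * Performs a one-off lookup on the requester's behalf: an input-path
 * lookup via ip_route_input() when RTA_IIF is given, otherwise an
 * output lookup via ip_route_output_key(), and unicasts the result
 * back as an RTM_NEWROUTE message.
 */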
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->u.dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

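/*
 * ip_rt_dump - dump the route cache for an RTM_GETROUTE dump request.
 *
 * Walks the hash table under rcu_read_lock_bh(), emitting one
 * RTM_NEWROUTE message per live entry.  cb->args[0]/args[1] record the
 * hash bucket and chain index so a partial dump resumes where the
 * previous skb filled up.
 */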
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set(skb, dst_clone(&rt->u.dst));
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

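/* Multicast configuration on a device changed: flush the cache now. */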
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
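/*
 * Writing a delay (in seconds) to the write-only "flush" sysctl
 * triggers a route cache flush for the owning namespace (stashed in
 * ->extra1).  The value is parsed into a stack copy of the ctl_table
 * so concurrent writers cannot race on ->data.
 */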
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     struct file *filp, void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
					      void __user *oldval,
					      size_t __user *oldlenp,
					      void __user *newval,
					      size_t newlen)
{
	int delay;
	struct net *net;

	if (newlen != sizeof(int))
		return -EINVAL;
	if (get_user(delay, (int __user *)newval))
		return -EFAULT;
	net = (struct net *)table->extra1;
	rt_cache_flush(net, delay);
	return 0;
}

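/*
 * ip_rt_secret_interval changed: re-arm the per-namespace secret
 * rebuild timers.  A pending timer is shifted by the difference
 * between the old and new interval (clamped to fire immediately if
 * already overdue); setting the interval to zero disables the timers.
 */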
static void rt_secret_reschedule(int old)
{
	struct net *net;
	int new = ip_rt_secret_interval;
	int diff = new - old;

	if (!diff)
		return;

	rtnl_lock();
	for_each_net(net) {
		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);

		if (!new)
			continue;

		if (deleted) {
			long time = net->ipv4.rt_secret_timer.expires - jiffies;

			if (time <= 0 || (time += diff) <= 0)
				time = 0;

			net->ipv4.rt_secret_timer.expires = time;
		} else
			net->ipv4.rt_secret_timer.expires = new;

		net->ipv4.rt_secret_timer.expires += jiffies;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	rtnl_unlock();
}

static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
					  struct file *filp,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	int old = ip_rt_secret_interval;
	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);

	rt_secret_reschedule(old);

	return ret;
}

static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
						   void __user *oldval,
						   size_t __user *oldlenp,
						   void __user *newval,
						   size_t newlen)
{
	int old = ip_rt_secret_interval;
	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);

	rt_secret_reschedule(old);

	return ret;
}

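/* Global route-cache tunables, exposed under /proc/sys/net/ipv4/route/. */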
static ctl_table ipv4_route_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
		.strategy	= sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
		.strategy	= sysctl_ms_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
		.strategy	= sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
		.strategy	= sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
		.strategy	= sysctl_jiffies,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
		.procname	= "secret_interval",
		.data		= &ip_rt_secret_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= ipv4_sysctl_rt_secret_interval,
		.strategy	= ipv4_sysctl_rt_secret_interval_strategy,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
		.strategy	= ipv4_sysctl_rtcache_flush_strategy,
	},
	{ .ctl_name = 0 },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", .ctl_name = CTL_NET, },
	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
	{ },
};

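/*
 * Per-namespace registration of the "flush" sysctl.  Namespaces other
 * than init_net get a kmemdup'd copy of the table; ->extra1 is pointed
 * at the owning struct net so the flush handler knows which namespace
 * to flush.
 */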
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (net != &init_net) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

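/*
 * Seed the per-namespace generation id (bumping it invalidates the
 * whole cache) and arm the timer that periodically rebuilds the hash
 * secret.  The first expiry is randomized so namespaces do not all
 * rebuild at once.
 */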
static __net_init int rt_secret_timer_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid,
			(int) ((num_physpages ^ (num_physpages>>8)) ^
			(jiffies ^ (jiffies >> 7))));

	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
	net->ipv4.rt_secret_timer.data = (unsigned long)net;
	init_timer_deferrable(&net->ipv4.rt_secret_timer);

	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires =
			jiffies + net_random() % ip_rt_secret_interval +
			ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	return 0;
}

static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}

static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

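/*
 * ip_rt_init - boot-time initialization of the IPv4 routing layer:
 * dst cache slab, route cache hash table (sized from memory unless
 * overridden with rhash_entries=), devinet/fib setup, periodic GC
 * work, proc files, and the RTM_GETROUTE handler.
 */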
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(num_physpages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers, started at system startup, tend
	   to synchronize. Perturb them a bit.
	 */
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);