blob: 780e9484c825e96c9994bf7ea2ac8cb96026c400 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 *
16 * Fixes:
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090023 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070024 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
26 * clamper.
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090041 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070042 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070057 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080058 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070060 *
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
65 */
66
Linus Torvalds1da177e2005-04-16 15:20:36 -070067#include <linux/module.h>
68#include <asm/uaccess.h>
69#include <asm/system.h>
70#include <linux/bitops.h>
71#include <linux/types.h>
72#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070074#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075#include <linux/string.h>
76#include <linux/socket.h>
77#include <linux/sockios.h>
78#include <linux/errno.h>
79#include <linux/in.h>
80#include <linux/inet.h>
81#include <linux/netdevice.h>
82#include <linux/proc_fs.h>
83#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070084#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070085#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070086#include <linux/inetdevice.h>
87#include <linux/igmp.h>
88#include <linux/pkt_sched.h>
89#include <linux/mroute.h>
90#include <linux/netfilter_ipv4.h>
91#include <linux/random.h>
92#include <linux/jhash.h>
93#include <linux/rcupdate.h>
94#include <linux/times.h>
Herbert Xu352e5122007-11-13 21:34:06 -080095#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020096#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070097#include <net/protocol.h>
98#include <net/ip.h>
99#include <net/route.h>
100#include <net/inetpeer.h>
101#include <net/sock.h>
102#include <net/ip_fib.h>
103#include <net/arp.h>
104#include <net/tcp.h>
105#include <net/icmp.h>
106#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700107#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700108#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
112
/*
 * Extract the TOS bits selected by IPTOS_RT_MASK, plus the RTO_ONLINK
 * flag, from the fl4_tos field of the flow key that oldflp points to.
 * The argument is parenthesized so that any pointer expression can be
 * passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116#define IP_MAX_MTU 0xFFF0
117
118#define RT_GC_TIMEOUT (300*HZ)
119
Linus Torvalds1da177e2005-04-16 15:20:36 -0700120static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700121static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
122static int ip_rt_gc_interval __read_mostly = 60 * HZ;
123static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
124static int ip_rt_redirect_number __read_mostly = 9;
125static int ip_rt_redirect_load __read_mostly = HZ / 50;
126static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost __read_mostly = HZ;
128static int ip_rt_error_burst __read_mostly = 5 * HZ;
129static int ip_rt_gc_elasticity __read_mostly = 8;
130static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132static int ip_rt_min_advmss __read_mostly = 256;
133static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700134
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800135static void rt_worker_func(struct work_struct *work);
136static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700137static struct timer_list rt_secret_timer;
138
139/*
140 * Interface to generic destination cache.
141 */
142
143static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
144static void ipv4_dst_destroy(struct dst_entry *dst);
145static void ipv4_dst_ifdown(struct dst_entry *dst,
146 struct net_device *dev, int how);
147static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
148static void ipv4_link_failure(struct sk_buff *skb);
149static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800150static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151
152
153static struct dst_ops ipv4_dst_ops = {
154 .family = AF_INET,
155 .protocol = __constant_htons(ETH_P_IP),
156 .gc = rt_garbage_collect,
157 .check = ipv4_dst_check,
158 .destroy = ipv4_dst_destroy,
159 .ifdown = ipv4_dst_ifdown,
160 .negative_advice = ipv4_negative_advice,
161 .link_failure = ipv4_link_failure,
162 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu862b82c2007-11-13 21:43:11 -0800163 .local_out = ip_local_out,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700164 .entry_size = sizeof(struct rtable),
Eric Dumazete2422972008-01-30 20:07:45 -0800165 .entries = ATOMIC_INIT(0),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700166};
167
168#define ECN_OR_COST(class) TC_PRIO_##class
169
Philippe De Muyter4839c522007-07-09 15:32:57 -0700170const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700171 TC_PRIO_BESTEFFORT,
172 ECN_OR_COST(FILLER),
173 TC_PRIO_BESTEFFORT,
174 ECN_OR_COST(BESTEFFORT),
175 TC_PRIO_BULK,
176 ECN_OR_COST(BULK),
177 TC_PRIO_BULK,
178 ECN_OR_COST(BULK),
179 TC_PRIO_INTERACTIVE,
180 ECN_OR_COST(INTERACTIVE),
181 TC_PRIO_INTERACTIVE,
182 ECN_OR_COST(INTERACTIVE),
183 TC_PRIO_INTERACTIVE_BULK,
184 ECN_OR_COST(INTERACTIVE_BULK),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK)
187};
188
189
190/*
191 * Route cache.
192 */
193
194/* The locking scheme is rather straight forward:
195 *
196 * 1) Read-Copy Update protects the buckets of the central route hash.
197 * 2) Only writers remove entries, and they hold the lock
198 * as they look at rtable reference counts.
199 * 3) Only readers acquire references to rtable entries,
200 * they do so with atomic increments and with the
201 * lock held.
202 */
203
204struct rt_hash_bucket {
205 struct rtable *chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700206};
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
208 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700209/*
210 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
211 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700212 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700213 */
Ingo Molnar62051202006-07-03 00:24:59 -0700214#ifdef CONFIG_LOCKDEP
215# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700216#else
Ingo Molnar62051202006-07-03 00:24:59 -0700217# if NR_CPUS >= 32
218# define RT_HASH_LOCK_SZ 4096
219# elif NR_CPUS >= 16
220# define RT_HASH_LOCK_SZ 2048
221# elif NR_CPUS >= 8
222# define RT_HASH_LOCK_SZ 1024
223# elif NR_CPUS >= 4
224# define RT_HASH_LOCK_SZ 512
225# else
226# define RT_HASH_LOCK_SZ 256
227# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700228#endif
229
230static spinlock_t *rt_hash_locks;
231# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800232
233static __init void rt_hash_lock_init(void)
234{
235 int i;
236
237 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
238 GFP_KERNEL);
239 if (!rt_hash_locks)
240 panic("IP: failed to allocate rt_hash_locks\n");
241
242 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
243 spin_lock_init(&rt_hash_locks[i]);
244}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700245#else
246# define rt_hash_lock_addr(slot) NULL
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800247
248static inline void rt_hash_lock_init(void)
249{
250}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700251#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700252
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700253static struct rt_hash_bucket *rt_hash_table __read_mostly;
254static unsigned rt_hash_mask __read_mostly;
255static unsigned int rt_hash_log __read_mostly;
256static atomic_t rt_genid __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700257
Eric Dumazet2f970d82006-01-17 02:54:36 -0800258static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800259#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700260 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700261
Stephen Hemminger1294fc42008-04-10 01:54:01 -0700262static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700263{
Stephen Hemminger1294fc42008-04-10 01:54:01 -0700264 return jhash_3words((__force u32)(__be32)(daddr),
265 (__force u32)(__be32)(saddr),
266 idx, atomic_read(&rt_genid))
Eric Dumazet29e75252008-01-31 17:05:09 -0800267 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268}
269
270#ifdef CONFIG_PROC_FS
271struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800272 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800274 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275};
276
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900277static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900279 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700281
282 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283 rcu_read_lock_bh();
Eric Dumazet29e75252008-01-31 17:05:09 -0800284 r = rcu_dereference(rt_hash_table[st->bucket].chain);
285 while (r) {
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900286 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800287 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800288 return r;
289 r = rcu_dereference(r->u.dst.rt_next);
290 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700291 rcu_read_unlock_bh();
292 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800293 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700294}
295
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900296static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800297 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700298{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900299 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800300 r = r->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700301 while (!r) {
302 rcu_read_unlock_bh();
303 if (--st->bucket < 0)
304 break;
305 rcu_read_lock_bh();
306 r = rt_hash_table[st->bucket].chain;
307 }
Eric Dumazet0bccead2008-01-10 03:55:57 -0800308 return rcu_dereference(r);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700309}
310
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900311static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800312 struct rtable *r)
313{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900314 struct rt_cache_iter_state *st = seq->private;
315 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
316 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800317 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800318 if (r->rt_genid == st->genid)
319 break;
320 }
321 return r;
322}
323
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900324static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900326 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700327
328 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900329 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700330 --pos;
331 return pos ? NULL : r;
332}
333
334static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
335{
Eric Dumazet29e75252008-01-31 17:05:09 -0800336 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800337 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900338 return rt_cache_get_idx(seq, *pos - 1);
Eric Dumazet29e75252008-01-31 17:05:09 -0800339 st->genid = atomic_read(&rt_genid);
340 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700341}
342
343static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
344{
Eric Dumazet29e75252008-01-31 17:05:09 -0800345 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700346
347 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900348 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700349 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900350 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351 ++*pos;
352 return r;
353}
354
355static void rt_cache_seq_stop(struct seq_file *seq, void *v)
356{
357 if (v && v != SEQ_START_TOKEN)
358 rcu_read_unlock_bh();
359}
360
361static int rt_cache_seq_show(struct seq_file *seq, void *v)
362{
363 if (v == SEQ_START_TOKEN)
364 seq_printf(seq, "%-127s\n",
365 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
366 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
367 "HHUptod\tSpecDst");
368 else {
369 struct rtable *r = v;
370 char temp[256];
371
372 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
373 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
374 r->u.dst.dev ? r->u.dst.dev->name : "*",
375 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
376 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
377 r->u.dst.__use, 0, (unsigned long)r->rt_src,
378 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
379 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
380 dst_metric(&r->u.dst, RTAX_WINDOW),
381 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
382 dst_metric(&r->u.dst, RTAX_RTTVAR)),
383 r->fl.fl4_tos,
384 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
385 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
386 dev_queue_xmit) : 0,
387 r->rt_spec_dst);
388 seq_printf(seq, "%-127s\n", temp);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900389 }
390 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700391}
392
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700393static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700394 .start = rt_cache_seq_start,
395 .next = rt_cache_seq_next,
396 .stop = rt_cache_seq_stop,
397 .show = rt_cache_seq_show,
398};
399
400static int rt_cache_seq_open(struct inode *inode, struct file *file)
401{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800402 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700403 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700404}
405
Arjan van de Ven9a321442007-02-12 00:55:35 -0800406static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700407 .owner = THIS_MODULE,
408 .open = rt_cache_seq_open,
409 .read = seq_read,
410 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800411 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700412};
413
414
415static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
416{
417 int cpu;
418
419 if (*pos == 0)
420 return SEQ_START_TOKEN;
421
422 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
423 if (!cpu_possible(cpu))
424 continue;
425 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800426 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700427 }
428 return NULL;
429}
430
431static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
432{
433 int cpu;
434
435 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
436 if (!cpu_possible(cpu))
437 continue;
438 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800439 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 }
441 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900442
Linus Torvalds1da177e2005-04-16 15:20:36 -0700443}
444
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
449
450static int rt_cpu_seq_show(struct seq_file *seq, void *v)
451{
452 struct rt_cache_stat *st = v;
453
454 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700455 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700456 return 0;
457 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900458
Linus Torvalds1da177e2005-04-16 15:20:36 -0700459 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
460 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
461 atomic_read(&ipv4_dst_ops.entries),
462 st->in_hit,
463 st->in_slow_tot,
464 st->in_slow_mc,
465 st->in_no_route,
466 st->in_brd,
467 st->in_martian_dst,
468 st->in_martian_src,
469
470 st->out_hit,
471 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900472 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700473
474 st->gc_total,
475 st->gc_ignored,
476 st->gc_goal_miss,
477 st->gc_dst_overflow,
478 st->in_hlist_search,
479 st->out_hlist_search
480 );
481 return 0;
482}
483
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700484static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485 .start = rt_cpu_seq_start,
486 .next = rt_cpu_seq_next,
487 .stop = rt_cpu_seq_stop,
488 .show = rt_cpu_seq_show,
489};
490
491
492static int rt_cpu_seq_open(struct inode *inode, struct file *file)
493{
494 return seq_open(file, &rt_cpu_seq_ops);
495}
496
Arjan van de Ven9a321442007-02-12 00:55:35 -0800497static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700498 .owner = THIS_MODULE,
499 .open = rt_cpu_seq_open,
500 .read = seq_read,
501 .llseek = seq_lseek,
502 .release = seq_release,
503};
504
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800505#ifdef CONFIG_NET_CLS_ROUTE
506static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
507 int length, int *eof, void *data)
508{
509 unsigned int i;
510
511 if ((offset & 3) || (length & 3))
512 return -EIO;
513
514 if (offset >= sizeof(struct ip_rt_acct) * 256) {
515 *eof = 1;
516 return 0;
517 }
518
519 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
520 length = sizeof(struct ip_rt_acct) * 256 - offset;
521 *eof = 1;
522 }
523
524 offset /= sizeof(u32);
525
526 if (length > 0) {
527 u32 *dst = (u32 *) buffer;
528
529 *start = buffer;
530 memset(dst, 0, length);
531
532 for_each_possible_cpu(i) {
533 unsigned int j;
534 u32 *src;
535
536 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
537 for (j = 0; j < length/4; j++)
538 dst[j] += src[j];
539 }
540 }
541 return length;
542}
543#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800544
Denis V. Lunev73b38712008-02-28 20:51:18 -0800545static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800546{
547 struct proc_dir_entry *pde;
548
549 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
550 &rt_cache_seq_fops);
551 if (!pde)
552 goto err1;
553
Wang Chen77020722008-02-28 14:14:25 -0800554 pde = proc_create("rt_cache", S_IRUGO,
555 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800556 if (!pde)
557 goto err2;
558
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800559#ifdef CONFIG_NET_CLS_ROUTE
560 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
561 ip_rt_acct_read, NULL);
562 if (!pde)
563 goto err3;
564#endif
565 return 0;
566
567#ifdef CONFIG_NET_CLS_ROUTE
568err3:
569 remove_proc_entry("rt_cache", net->proc_net_stat);
570#endif
571err2:
572 remove_proc_entry("rt_cache", net->proc_net);
573err1:
574 return -ENOMEM;
575}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800576
577static void __net_exit ip_rt_do_proc_exit(struct net *net)
578{
579 remove_proc_entry("rt_cache", net->proc_net_stat);
580 remove_proc_entry("rt_cache", net->proc_net);
581 remove_proc_entry("rt_acct", net->proc_net);
582}
583
584static struct pernet_operations ip_rt_proc_ops __net_initdata = {
585 .init = ip_rt_do_proc_init,
586 .exit = ip_rt_do_proc_exit,
587};
588
589static int __init ip_rt_proc_init(void)
590{
591 return register_pernet_subsys(&ip_rt_proc_ops);
592}
593
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800594#else
Denis V. Lunev73b38712008-02-28 20:51:18 -0800595static inline int ip_rt_proc_init(void)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800596{
597 return 0;
598}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700599#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900600
Stephen Hemminger5969f712008-04-10 01:52:09 -0700601static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700602{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700603 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
604}
605
Stephen Hemminger5969f712008-04-10 01:52:09 -0700606static inline void rt_drop(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700607{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700608 ip_rt_put(rt);
609 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
610}
611
Stephen Hemminger5969f712008-04-10 01:52:09 -0700612static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700613{
614 /* Kill broadcast/multicast entries very aggresively, if they
615 collide in hash table with more useful entries */
616 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800617 rth->fl.iif && rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700618}
619
Stephen Hemminger5969f712008-04-10 01:52:09 -0700620static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700621{
622 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
623 rth->u.dst.expires;
624}
625
626static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
627{
628 unsigned long age;
629 int ret = 0;
630
631 if (atomic_read(&rth->u.dst.__refcnt))
632 goto out;
633
634 ret = 1;
635 if (rth->u.dst.expires &&
636 time_after_eq(jiffies, rth->u.dst.expires))
637 goto out;
638
639 age = jiffies - rth->u.dst.lastuse;
640 ret = 0;
641 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
642 (age <= tmo2 && rt_valuable(rth)))
643 goto out;
644 ret = 1;
645out: return ret;
646}
647
648/* Bits of score are:
649 * 31: very valuable
650 * 30: not quite useless
651 * 29..0: usage counter
652 */
653static inline u32 rt_score(struct rtable *rt)
654{
655 u32 score = jiffies - rt->u.dst.lastuse;
656
657 score = ~score & ~(3<<30);
658
659 if (rt_valuable(rt))
660 score |= (1<<31);
661
662 if (!rt->fl.iif ||
663 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
664 score |= (1<<30);
665
666 return score;
667}
668
669static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
670{
Al Viro714e85b2006-11-14 20:51:49 -0800671 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
672 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
Thomas Graf47dcf0c2006-11-09 15:20:38 -0800673 (fl1->mark ^ fl2->mark) |
David S. Miller8238b212006-10-12 00:49:15 -0700674 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
675 *(u16 *)&fl2->nl_u.ip4_u.tos) |
676 (fl1->oif ^ fl2->oif) |
677 (fl1->iif ^ fl2->iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700678}
679
Denis V. Lunevb5921912008-01-22 23:50:25 -0800680static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
681{
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +0900682 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
Denis V. Lunevb5921912008-01-22 23:50:25 -0800683}
684
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800685/*
686 * Perform a full scan of hash table and free all entries.
687 * Can be called by a softirq or a process.
688 * In the later case, we want to be reschedule if necessary
689 */
690static void rt_do_flush(int process_context)
691{
692 unsigned int i;
693 struct rtable *rth, *next;
694
695 for (i = 0; i <= rt_hash_mask; i++) {
696 if (process_context && need_resched())
697 cond_resched();
698 rth = rt_hash_table[i].chain;
699 if (!rth)
700 continue;
701
702 spin_lock_bh(rt_hash_lock_addr(i));
703 rth = rt_hash_table[i].chain;
704 rt_hash_table[i].chain = NULL;
705 spin_unlock_bh(rt_hash_lock_addr(i));
706
707 for (; rth; rth = next) {
708 next = rth->u.dst.rt_next;
709 rt_free(rth);
710 }
711 }
712}
713
714static void rt_check_expire(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700715{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700716 static unsigned int rover;
717 unsigned int i = rover, goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700718 struct rtable *rth, **rthp;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700719 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700720
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700721 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
722 if (ip_rt_gc_timeout > 1)
723 do_div(mult, ip_rt_gc_timeout);
724 goal = (unsigned int)mult;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700725 if (goal > rt_hash_mask)
726 goal = rt_hash_mask + 1;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700727 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700728 unsigned long tmo = ip_rt_gc_timeout;
729
730 i = (i + 1) & rt_hash_mask;
731 rthp = &rt_hash_table[i].chain;
732
Eric Dumazetd90bf5a2007-11-14 16:14:05 -0800733 if (need_resched())
734 cond_resched();
735
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700736 if (*rthp == NULL)
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700737 continue;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700738 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700739 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800740 if (rth->rt_genid != atomic_read(&rt_genid)) {
741 *rthp = rth->u.dst.rt_next;
742 rt_free(rth);
743 continue;
744 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700745 if (rth->u.dst.expires) {
746 /* Entry is expired even if it is in use */
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700747 if (time_before_eq(jiffies, rth->u.dst.expires)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700748 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800749 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700750 continue;
751 }
752 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
753 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800754 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700755 continue;
756 }
757
758 /* Cleanup aged off entries. */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800759 *rthp = rth->u.dst.rt_next;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900760 rt_free(rth);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700761 }
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700762 spin_unlock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700763 }
764 rover = i;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800765}
766
767/*
768 * rt_worker_func() is run in process context.
Eric Dumazet29e75252008-01-31 17:05:09 -0800769 * we call rt_check_expire() to scan part of the hash table
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800770 */
771static void rt_worker_func(struct work_struct *work)
772{
Eric Dumazet29e75252008-01-31 17:05:09 -0800773 rt_check_expire();
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700774 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700775}
776
Eric Dumazet29e75252008-01-31 17:05:09 -0800777/*
778 * Pertubation of rt_genid by a small quantity [1..256]
779 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
780 * many times (2^24) without giving recent rt_genid.
781 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700782 */
Eric Dumazet29e75252008-01-31 17:05:09 -0800783static void rt_cache_invalidate(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700784{
Eric Dumazet29e75252008-01-31 17:05:09 -0800785 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700786
Eric Dumazet29e75252008-01-31 17:05:09 -0800787 get_random_bytes(&shuffle, sizeof(shuffle));
788 atomic_add(shuffle + 1U, &rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700789}
790
/*
 * delay < 0  : invalidate the cache only (fast: entries are deleted later)
 * delay >= 0 : invalidate and flush the cache right away (can be long)
 */
void rt_cache_flush(int delay)
{
	rt_cache_invalidate();
	if (delay < 0)
		return;
	rt_do_flush(!in_softirq());
}
801
802/*
803 * We change rt_genid and let gc do the cleanup
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800804 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700805static void rt_secret_rebuild(unsigned long dummy)
806{
Eric Dumazet29e75252008-01-31 17:05:09 -0800807 rt_cache_invalidate();
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800808 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700809}
810
811/*
812 Short description of GC goals.
813
814 We want to build algorithm, which will keep routing cache
815 at some equilibrium point, when number of aged off entries
816 is kept approximately equal to newly generated ones.
817
818 Current expiration strength is variable "expire".
819 We try to adjust it dynamically, so that if networking
820 is idle expires is large enough to keep enough of warm entries,
821 and when load increases it reduces to limit cache size.
822 */
823
Daniel Lezcano569d3642008-01-18 03:56:57 -0800824static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700825{
826 static unsigned long expire = RT_GC_TIMEOUT;
827 static unsigned long last_gc;
828 static int rover;
829 static int equilibrium;
830 struct rtable *rth, **rthp;
831 unsigned long now = jiffies;
832 int goal;
833
834 /*
835 * Garbage collection is pretty expensive,
836 * do not make it too frequently.
837 */
838
839 RT_CACHE_STAT_INC(gc_total);
840
841 if (now - last_gc < ip_rt_gc_min_interval &&
842 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
843 RT_CACHE_STAT_INC(gc_ignored);
844 goto out;
845 }
846
847 /* Calculate number of entries, which we want to expire now. */
848 goal = atomic_read(&ipv4_dst_ops.entries) -
849 (ip_rt_gc_elasticity << rt_hash_log);
850 if (goal <= 0) {
851 if (equilibrium < ipv4_dst_ops.gc_thresh)
852 equilibrium = ipv4_dst_ops.gc_thresh;
853 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
854 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -0800855 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700856 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
857 }
858 } else {
859 /* We are in dangerous area. Try to reduce cache really
860 * aggressively.
861 */
Eric Dumazetb790ced2007-12-21 01:49:07 -0800862 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700863 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
864 }
865
866 if (now - last_gc >= ip_rt_gc_min_interval)
867 last_gc = now;
868
869 if (goal <= 0) {
870 equilibrium += goal;
871 goto work_done;
872 }
873
874 do {
875 int i, k;
876
877 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
878 unsigned long tmo = expire;
879
880 k = (k + 1) & rt_hash_mask;
881 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700882 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700883 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800884 if (rth->rt_genid == atomic_read(&rt_genid) &&
885 !rt_may_expire(rth, tmo, expire)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700886 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800887 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700888 continue;
889 }
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800890 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700891 rt_free(rth);
892 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700893 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700894 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700895 if (goal <= 0)
896 break;
897 }
898 rover = k;
899
900 if (goal <= 0)
901 goto work_done;
902
903 /* Goal is not achieved. We stop process if:
904
905 - if expire reduced to zero. Otherwise, expire is halfed.
906 - if table is not full.
907 - if we are called from interrupt.
908 - jiffies check is just fallback/debug loop breaker.
909 We will not spin here for long time in any case.
910 */
911
912 RT_CACHE_STAT_INC(gc_goal_miss);
913
914 if (expire == 0)
915 break;
916
917 expire >>= 1;
918#if RT_CACHE_DEBUG >= 2
919 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
920 atomic_read(&ipv4_dst_ops.entries), goal, i);
921#endif
922
923 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
924 goto out;
925 } while (!in_softirq() && time_before_eq(jiffies, now));
926
927 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
928 goto out;
929 if (net_ratelimit())
930 printk(KERN_WARNING "dst cache overflow\n");
931 RT_CACHE_STAT_INC(gc_dst_overflow);
932 return 1;
933
934work_done:
935 expire += ip_rt_gc_min_interval;
936 if (expire > ip_rt_gc_timeout ||
937 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
938 expire = ip_rt_gc_timeout;
939#if RT_CACHE_DEBUG >= 2
940 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
941 atomic_read(&ipv4_dst_ops.entries), goal, rover);
942#endif
943out: return 0;
944}
945
946static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
947{
948 struct rtable *rth, **rthp;
949 unsigned long now;
950 struct rtable *cand, **candp;
951 u32 min_score;
952 int chain_length;
953 int attempts = !in_softirq();
954
955restart:
956 chain_length = 0;
957 min_score = ~(u32)0;
958 cand = NULL;
959 candp = NULL;
960 now = jiffies;
961
962 rthp = &rt_hash_table[hash].chain;
963
Eric Dumazet22c047c2005-07-05 14:55:24 -0700964 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700965 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800966 if (rth->rt_genid != atomic_read(&rt_genid)) {
967 *rthp = rth->u.dst.rt_next;
968 rt_free(rth);
969 continue;
970 }
Denis V. Lunevb5921912008-01-22 23:50:25 -0800971 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700972 /* Put it first */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800973 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700974 /*
975 * Since lookup is lockfree, the deletion
976 * must be visible to another weakly ordered CPU before
977 * the insertion at the start of the hash chain.
978 */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800979 rcu_assign_pointer(rth->u.dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700980 rt_hash_table[hash].chain);
981 /*
982 * Since lookup is lockfree, the update writes
983 * must be ordered for consistency on SMP.
984 */
985 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
986
Pavel Emelyanov03f49f32007-11-10 21:28:34 -0800987 dst_use(&rth->u.dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -0700988 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700989
990 rt_drop(rt);
991 *rp = rth;
992 return 0;
993 }
994
995 if (!atomic_read(&rth->u.dst.__refcnt)) {
996 u32 score = rt_score(rth);
997
998 if (score <= min_score) {
999 cand = rth;
1000 candp = rthp;
1001 min_score = score;
1002 }
1003 }
1004
1005 chain_length++;
1006
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001007 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001008 }
1009
1010 if (cand) {
1011 /* ip_rt_gc_elasticity used to be average length of chain
1012 * length, when exceeded gc becomes really aggressive.
1013 *
1014 * The second limit is less certain. At the moment it allows
1015 * only 2 entries per bucket. We will see.
1016 */
1017 if (chain_length > ip_rt_gc_elasticity) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001018 *candp = cand->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001019 rt_free(cand);
1020 }
1021 }
1022
1023 /* Try to bind route to arp only if it is output
1024 route or unicast forwarding path.
1025 */
1026 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1027 int err = arp_bind_neighbour(&rt->u.dst);
1028 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001029 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001030
1031 if (err != -ENOBUFS) {
1032 rt_drop(rt);
1033 return err;
1034 }
1035
1036 /* Neighbour tables are full and nothing
1037 can be released. Try to shrink route cache,
1038 it is most likely it holds some neighbour records.
1039 */
1040 if (attempts-- > 0) {
1041 int saved_elasticity = ip_rt_gc_elasticity;
1042 int saved_int = ip_rt_gc_min_interval;
1043 ip_rt_gc_elasticity = 1;
1044 ip_rt_gc_min_interval = 0;
Daniel Lezcano569d3642008-01-18 03:56:57 -08001045 rt_garbage_collect(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001046 ip_rt_gc_min_interval = saved_int;
1047 ip_rt_gc_elasticity = saved_elasticity;
1048 goto restart;
1049 }
1050
1051 if (net_ratelimit())
1052 printk(KERN_WARNING "Neighbour table overflow.\n");
1053 rt_drop(rt);
1054 return -ENOBUFS;
1055 }
1056 }
1057
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001058 rt->u.dst.rt_next = rt_hash_table[hash].chain;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001059#if RT_CACHE_DEBUG >= 2
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001060 if (rt->u.dst.rt_next) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001061 struct rtable *trt;
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001062 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001063 NIPQUAD(rt->rt_dst));
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001064 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001065 printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001066 printk("\n");
1067 }
1068#endif
1069 rt_hash_table[hash].chain = rt;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001070 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001071 *rp = rt;
1072 return 0;
1073}
1074
1075void rt_bind_peer(struct rtable *rt, int create)
1076{
1077 static DEFINE_SPINLOCK(rt_peer_lock);
1078 struct inet_peer *peer;
1079
1080 peer = inet_getpeer(rt->rt_dst, create);
1081
1082 spin_lock_bh(&rt_peer_lock);
1083 if (rt->peer == NULL) {
1084 rt->peer = peer;
1085 peer = NULL;
1086 }
1087 spin_unlock_bh(&rt_peer_lock);
1088 if (peer)
1089 inet_putpeer(peer);
1090}
1091
1092/*
1093 * Peer allocation may fail only in serious out-of-memory conditions. However
1094 * we still can generate some output.
1095 * Random ID selection looks a bit dangerous because we have no chances to
1096 * select ID being unique in a reasonable period of time.
1097 * But broken packet identifier may be better than no packet at all.
1098 */
1099static void ip_select_fb_ident(struct iphdr *iph)
1100{
1101 static DEFINE_SPINLOCK(ip_fb_id_lock);
1102 static u32 ip_fallback_id;
1103 u32 salt;
1104
1105 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001106 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001107 iph->id = htons(salt & 0xFFFF);
1108 ip_fallback_id = salt;
1109 spin_unlock_bh(&ip_fb_id_lock);
1110}
1111
1112void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1113{
1114 struct rtable *rt = (struct rtable *) dst;
1115
1116 if (rt) {
1117 if (rt->peer == NULL)
1118 rt_bind_peer(rt, 1);
1119
1120 /* If peer is attached to destination, it is never detached,
1121 so that we need not to grab a lock to dereference it.
1122 */
1123 if (rt->peer) {
1124 iph->id = htons(inet_getid(rt->peer, more));
1125 return;
1126 }
1127 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001128 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001129 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001130
1131 ip_select_fb_ident(iph);
1132}
1133
1134static void rt_del(unsigned hash, struct rtable *rt)
1135{
Eric Dumazet29e75252008-01-31 17:05:09 -08001136 struct rtable **rthp, *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001137
Eric Dumazet29e75252008-01-31 17:05:09 -08001138 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001139 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001140 ip_rt_put(rt);
Eric Dumazet29e75252008-01-31 17:05:09 -08001141 while ((aux = *rthp) != NULL) {
1142 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1143 *rthp = aux->u.dst.rt_next;
1144 rt_free(aux);
1145 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001146 }
Eric Dumazet29e75252008-01-31 17:05:09 -08001147 rthp = &aux->u.dst.rt_next;
1148 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001149 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001150}
1151
Al Virof7655222006-09-26 21:25:43 -07001152void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1153 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001154{
1155 int i, k;
1156 struct in_device *in_dev = in_dev_get(dev);
1157 struct rtable *rth, **rthp;
Al Virof7655222006-09-26 21:25:43 -07001158 __be32 skeys[2] = { saddr, 0 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001159 int ikeys[2] = { dev->ifindex, 0 };
Tom Tucker8d717402006-07-30 20:43:36 -07001160 struct netevent_redirect netevent;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001161 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001162
Linus Torvalds1da177e2005-04-16 15:20:36 -07001163 if (!in_dev)
1164 return;
1165
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001166 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001167 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001168 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
Joe Perchesf97c1e02007-12-16 13:45:43 -08001169 || ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001170 goto reject_redirect;
1171
1172 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1173 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1174 goto reject_redirect;
1175 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1176 goto reject_redirect;
1177 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001178 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001179 goto reject_redirect;
1180 }
1181
1182 for (i = 0; i < 2; i++) {
1183 for (k = 0; k < 2; k++) {
Al Viro8c7bc842006-09-26 21:26:19 -07001184 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001185
1186 rthp=&rt_hash_table[hash].chain;
1187
1188 rcu_read_lock();
1189 while ((rth = rcu_dereference(*rthp)) != NULL) {
1190 struct rtable *rt;
1191
1192 if (rth->fl.fl4_dst != daddr ||
1193 rth->fl.fl4_src != skeys[i] ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001194 rth->fl.oif != ikeys[k] ||
Eric Dumazet29e75252008-01-31 17:05:09 -08001195 rth->fl.iif != 0 ||
Denis V. Lunev317805b2008-02-28 20:50:06 -08001196 rth->rt_genid != atomic_read(&rt_genid) ||
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001197 !net_eq(dev_net(rth->u.dst.dev), net)) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001198 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001199 continue;
1200 }
1201
1202 if (rth->rt_dst != daddr ||
1203 rth->rt_src != saddr ||
1204 rth->u.dst.error ||
1205 rth->rt_gateway != old_gw ||
1206 rth->u.dst.dev != dev)
1207 break;
1208
1209 dst_hold(&rth->u.dst);
1210 rcu_read_unlock();
1211
1212 rt = dst_alloc(&ipv4_dst_ops);
1213 if (rt == NULL) {
1214 ip_rt_put(rth);
1215 in_dev_put(in_dev);
1216 return;
1217 }
1218
1219 /* Copy all the information. */
1220 *rt = *rth;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001221 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001222 rt->u.dst.__use = 1;
1223 atomic_set(&rt->u.dst.__refcnt, 1);
1224 rt->u.dst.child = NULL;
1225 if (rt->u.dst.dev)
1226 dev_hold(rt->u.dst.dev);
1227 if (rt->idev)
1228 in_dev_hold(rt->idev);
1229 rt->u.dst.obsolete = 0;
1230 rt->u.dst.lastuse = jiffies;
1231 rt->u.dst.path = &rt->u.dst;
1232 rt->u.dst.neighbour = NULL;
1233 rt->u.dst.hh = NULL;
1234 rt->u.dst.xfrm = NULL;
Eric Dumazet29e75252008-01-31 17:05:09 -08001235 rt->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001236 rt->rt_flags |= RTCF_REDIRECTED;
1237
1238 /* Gateway is different ... */
1239 rt->rt_gateway = new_gw;
1240
1241 /* Redirect received -> path was valid */
1242 dst_confirm(&rth->u.dst);
1243
1244 if (rt->peer)
1245 atomic_inc(&rt->peer->refcnt);
1246
1247 if (arp_bind_neighbour(&rt->u.dst) ||
1248 !(rt->u.dst.neighbour->nud_state &
1249 NUD_VALID)) {
1250 if (rt->u.dst.neighbour)
1251 neigh_event_send(rt->u.dst.neighbour, NULL);
1252 ip_rt_put(rth);
1253 rt_drop(rt);
1254 goto do_next;
1255 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001256
Tom Tucker8d717402006-07-30 20:43:36 -07001257 netevent.old = &rth->u.dst;
1258 netevent.new = &rt->u.dst;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001259 call_netevent_notifiers(NETEVENT_REDIRECT,
1260 &netevent);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001261
1262 rt_del(hash, rth);
1263 if (!rt_intern_hash(hash, rt, &rt))
1264 ip_rt_put(rt);
1265 goto do_next;
1266 }
1267 rcu_read_unlock();
1268 do_next:
1269 ;
1270 }
1271 }
1272 in_dev_put(in_dev);
1273 return;
1274
1275reject_redirect:
1276#ifdef CONFIG_IP_ROUTE_VERBOSE
1277 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001278 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1279 NIPQUAD_FMT " ignored.\n"
1280 " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001281 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001282 NIPQUAD(saddr), NIPQUAD(daddr));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283#endif
1284 in_dev_put(in_dev);
1285}
1286
1287static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1288{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001289 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001290 struct dst_entry *ret = dst;
1291
1292 if (rt) {
1293 if (dst->obsolete) {
1294 ip_rt_put(rt);
1295 ret = NULL;
1296 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1297 rt->u.dst.expires) {
Al Viro8c7bc842006-09-26 21:26:19 -07001298 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1299 rt->fl.oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001300#if RT_CACHE_DEBUG >= 1
Denis V. Lunev56c99d02007-12-06 02:19:07 -08001301 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001302 NIPQUAD_FMT "/%02x dropped\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001303 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1304#endif
1305 rt_del(hash, rt);
1306 ret = NULL;
1307 }
1308 }
1309 return ret;
1310}
1311
1312/*
1313 * Algorithm:
1314 * 1. The first ip_rt_redirect_number redirects are sent
1315 * with exponential backoff, then we stop sending them at all,
1316 * assuming that the host ignores our redirects.
1317 * 2. If we did not see packets requiring redirects
1318 * during ip_rt_redirect_silence, we assume that the host
1319 * forgot redirected route and start to send redirects again.
1320 *
1321 * This algorithm is much cheaper and more intelligent than dumb load limiting
1322 * in icmp.c.
1323 *
1324 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1325 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1326 */
1327
1328void ip_rt_send_redirect(struct sk_buff *skb)
1329{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001330 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001331 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1332
1333 if (!in_dev)
1334 return;
1335
1336 if (!IN_DEV_TX_REDIRECTS(in_dev))
1337 goto out;
1338
1339 /* No redirected packets during ip_rt_redirect_silence;
1340 * reset the algorithm.
1341 */
1342 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1343 rt->u.dst.rate_tokens = 0;
1344
1345 /* Too many ignored redirects; do not send anything
1346 * set u.dst.rate_last to the last seen redirected packet.
1347 */
1348 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1349 rt->u.dst.rate_last = jiffies;
1350 goto out;
1351 }
1352
1353 /* Check for load limit; set rate_last to the latest sent
1354 * redirect.
1355 */
Li Yewang14fb8a72006-12-18 00:26:35 -08001356 if (rt->u.dst.rate_tokens == 0 ||
1357 time_after(jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358 (rt->u.dst.rate_last +
1359 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1360 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1361 rt->u.dst.rate_last = jiffies;
1362 ++rt->u.dst.rate_tokens;
1363#ifdef CONFIG_IP_ROUTE_VERBOSE
1364 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1365 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1366 net_ratelimit())
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001367 printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1368 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001369 NIPQUAD(rt->rt_src), rt->rt_iif,
1370 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1371#endif
1372 }
1373out:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001374 in_dev_put(in_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375}
1376
1377static int ip_error(struct sk_buff *skb)
1378{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001379 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001380 unsigned long now;
1381 int code;
1382
1383 switch (rt->u.dst.error) {
1384 case EINVAL:
1385 default:
1386 goto out;
1387 case EHOSTUNREACH:
1388 code = ICMP_HOST_UNREACH;
1389 break;
1390 case ENETUNREACH:
1391 code = ICMP_NET_UNREACH;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08001392 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001393 break;
1394 case EACCES:
1395 code = ICMP_PKT_FILTERED;
1396 break;
1397 }
1398
1399 now = jiffies;
1400 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1401 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1402 rt->u.dst.rate_tokens = ip_rt_error_burst;
1403 rt->u.dst.rate_last = now;
1404 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1405 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1406 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1407 }
1408
1409out: kfree_skb(skb);
1410 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001411}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001412
/*
 * MTU plateau table, in descending order.
 *
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * guess_mtu() - pick the next plateau strictly below @old_mtu.
 *
 * Returns the first (i.e. largest) table entry that is smaller than
 * @old_mtu, or 68 when @old_mtu is not above any entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; plateau < end; plateau++) {
		if (*plateau < old_mtu)
			return *plateau;
	}
	return 68;
}
1430
Denis V. Lunevb5921912008-01-22 23:50:25 -08001431unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1432 unsigned short new_mtu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001433{
1434 int i;
1435 unsigned short old_mtu = ntohs(iph->tot_len);
1436 struct rtable *rth;
Al Viroe4485152006-09-26 22:15:01 -07001437 __be32 skeys[2] = { iph->saddr, 0, };
1438 __be32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001439 unsigned short est_mtu = 0;
1440
1441 if (ipv4_config.no_pmtu_disc)
1442 return 0;
1443
1444 for (i = 0; i < 2; i++) {
Al Viro8c7bc842006-09-26 21:26:19 -07001445 unsigned hash = rt_hash(daddr, skeys[i], 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001446
1447 rcu_read_lock();
1448 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001449 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001450 if (rth->fl.fl4_dst == daddr &&
1451 rth->fl.fl4_src == skeys[i] &&
1452 rth->rt_dst == daddr &&
1453 rth->rt_src == iph->saddr &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454 rth->fl.iif == 0 &&
Denis V. Lunevb5921912008-01-22 23:50:25 -08001455 !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001456 net_eq(dev_net(rth->u.dst.dev), net) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08001457 rth->rt_genid == atomic_read(&rt_genid)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458 unsigned short mtu = new_mtu;
1459
1460 if (new_mtu < 68 || new_mtu >= old_mtu) {
1461
1462 /* BSD 4.2 compatibility hack :-( */
1463 if (mtu == 0 &&
1464 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1465 old_mtu >= 68 + (iph->ihl << 2))
1466 old_mtu -= iph->ihl << 2;
1467
1468 mtu = guess_mtu(old_mtu);
1469 }
1470 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001471 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001472 dst_confirm(&rth->u.dst);
1473 if (mtu < ip_rt_min_pmtu) {
1474 mtu = ip_rt_min_pmtu;
1475 rth->u.dst.metrics[RTAX_LOCK-1] |=
1476 (1 << RTAX_MTU);
1477 }
1478 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1479 dst_set_expires(&rth->u.dst,
1480 ip_rt_mtu_expires);
1481 }
1482 est_mtu = mtu;
1483 }
1484 }
1485 }
1486 rcu_read_unlock();
1487 }
1488 return est_mtu ? : new_mtu;
1489}
1490
1491static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1492{
1493 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1494 !(dst_metric_locked(dst, RTAX_MTU))) {
1495 if (mtu < ip_rt_min_pmtu) {
1496 mtu = ip_rt_min_pmtu;
1497 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1498 }
1499 dst->metrics[RTAX_MTU-1] = mtu;
1500 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001501 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001502 }
1503}
1504
1505static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1506{
1507 return NULL;
1508}
1509
1510static void ipv4_dst_destroy(struct dst_entry *dst)
1511{
1512 struct rtable *rt = (struct rtable *) dst;
1513 struct inet_peer *peer = rt->peer;
1514 struct in_device *idev = rt->idev;
1515
1516 if (peer) {
1517 rt->peer = NULL;
1518 inet_putpeer(peer);
1519 }
1520
1521 if (idev) {
1522 rt->idev = NULL;
1523 in_dev_put(idev);
1524 }
1525}
1526
1527static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1528 int how)
1529{
1530 struct rtable *rt = (struct rtable *) dst;
1531 struct in_device *idev = rt->idev;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001532 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
Denis V. Lunev5a3e55d2007-12-07 00:38:10 -08001533 struct in_device *loopback_idev =
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001534 in_dev_get(dev_net(dev)->loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535 if (loopback_idev) {
1536 rt->idev = loopback_idev;
1537 in_dev_put(idev);
1538 }
1539 }
1540}
1541
1542static void ipv4_link_failure(struct sk_buff *skb)
1543{
1544 struct rtable *rt;
1545
1546 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1547
Eric Dumazetee6b9672008-03-05 18:30:47 -08001548 rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 if (rt)
1550 dst_set_expires(&rt->u.dst, 0);
1551}
1552
1553static int ip_rt_bug(struct sk_buff *skb)
1554{
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001555 printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001556 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001557 skb->dev ? skb->dev->name : "?");
1558 kfree_skb(skb);
1559 return 0;
1560}
1561
1562/*
1563 We do not cache source address of outgoing interface,
1564 because it is used only by IP RR, TS and SRR options,
1565 so that it out of fast path.
1566
1567 BTW remember: "addr" is allowed to be not aligned
1568 in IP options!
1569 */
1570
1571void ip_rt_get_source(u8 *addr, struct rtable *rt)
1572{
Al Viroa61ced52006-09-26 21:27:54 -07001573 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001574 struct fib_result res;
1575
1576 if (rt->fl.iif == 0)
1577 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001578 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001579 src = FIB_RES_PREFSRC(res);
1580 fib_res_put(&res);
1581 } else
1582 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1583 RT_SCOPE_UNIVERSE);
1584 memcpy(addr, &src, 4);
1585}
1586
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag() - merge @tag into the route's tclassid.
 *
 * The low and the high 16-bit halves are handled independently: a half
 * of tclassid that is still zero takes the corresponding half of @tag,
 * a half that is already set is left alone.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	/* Low half. */
	if ((rt->u.dst.tclassid & 0xFFFF) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF;

	/* High half. */
	if ((rt->u.dst.tclassid & 0xFFFF0000) == 0)
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif
1596
1597static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1598{
1599 struct fib_info *fi = res->fi;
1600
1601 if (fi) {
1602 if (FIB_RES_GW(*res) &&
1603 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1604 rt->rt_gateway = FIB_RES_GW(*res);
1605 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1606 sizeof(rt->u.dst.metrics));
1607 if (fi->fib_mtu == 0) {
1608 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1609 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1610 rt->rt_gateway != rt->rt_dst &&
1611 rt->u.dst.dev->mtu > 576)
1612 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1613 }
1614#ifdef CONFIG_NET_CLS_ROUTE
1615 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1616#endif
1617 } else
1618 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1619
1620 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1621 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1622 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1623 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1624 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1625 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1626 ip_rt_min_advmss);
1627 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1628 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1629
1630#ifdef CONFIG_NET_CLS_ROUTE
1631#ifdef CONFIG_IP_MULTIPLE_TABLES
1632 set_class_tag(rt, fib_rules_tclass(res));
1633#endif
1634 set_class_tag(rt, itag);
1635#endif
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001636 rt->rt_type = res->type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001637}
1638
Al Viro9e12bb22006-09-26 21:25:20 -07001639static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001640 u8 tos, struct net_device *dev, int our)
1641{
1642 unsigned hash;
1643 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001644 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001645 struct in_device *in_dev = in_dev_get(dev);
1646 u32 itag = 0;
1647
1648 /* Primary sanity checks. */
1649
1650 if (in_dev == NULL)
1651 return -EINVAL;
1652
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001653 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001654 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001655 goto e_inval;
1656
Joe Perchesf97c1e02007-12-16 13:45:43 -08001657 if (ipv4_is_zeronet(saddr)) {
1658 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001659 goto e_inval;
1660 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1661 } else if (fib_validate_source(saddr, 0, tos, 0,
1662 dev, &spec_dst, &itag) < 0)
1663 goto e_inval;
1664
1665 rth = dst_alloc(&ipv4_dst_ops);
1666 if (!rth)
1667 goto e_nobufs;
1668
1669 rth->u.dst.output= ip_rt_bug;
1670
1671 atomic_set(&rth->u.dst.__refcnt, 1);
1672 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001673 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674 rth->u.dst.flags |= DST_NOPOLICY;
1675 rth->fl.fl4_dst = daddr;
1676 rth->rt_dst = daddr;
1677 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001678 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679 rth->fl.fl4_src = saddr;
1680 rth->rt_src = saddr;
1681#ifdef CONFIG_NET_CLS_ROUTE
1682 rth->u.dst.tclassid = itag;
1683#endif
1684 rth->rt_iif =
1685 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001686 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001687 dev_hold(rth->u.dst.dev);
1688 rth->idev = in_dev_get(rth->u.dst.dev);
1689 rth->fl.oif = 0;
1690 rth->rt_gateway = daddr;
1691 rth->rt_spec_dst= spec_dst;
Eric Dumazet29e75252008-01-31 17:05:09 -08001692 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001694 rth->rt_type = RTN_MULTICAST;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001695 if (our) {
1696 rth->u.dst.input= ip_local_deliver;
1697 rth->rt_flags |= RTCF_LOCAL;
1698 }
1699
1700#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001701 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001702 rth->u.dst.input = ip_mr_input;
1703#endif
1704 RT_CACHE_STAT_INC(in_slow_mc);
1705
1706 in_dev_put(in_dev);
Al Viro8c7bc842006-09-26 21:26:19 -07001707 hash = rt_hash(daddr, saddr, dev->ifindex);
Eric Dumazetee6b9672008-03-05 18:30:47 -08001708 return rt_intern_hash(hash, rth, &skb->rtable);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709
1710e_nobufs:
1711 in_dev_put(in_dev);
1712 return -ENOBUFS;
1713
1714e_inval:
1715 in_dev_put(in_dev);
1716 return -EINVAL;
1717}
1718
1719
1720static void ip_handle_martian_source(struct net_device *dev,
1721 struct in_device *in_dev,
1722 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001723 __be32 daddr,
1724 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001725{
1726 RT_CACHE_STAT_INC(in_martian_src);
1727#ifdef CONFIG_IP_ROUTE_VERBOSE
1728 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1729 /*
1730 * RFC1812 recommendation, if source is martian,
1731 * the only hint is MAC header.
1732 */
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001733 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1734 NIPQUAD_FMT", on dev %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001735 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001736 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001737 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001738 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001739 printk(KERN_WARNING "ll header: ");
1740 for (i = 0; i < dev->hard_header_len; i++, p++) {
1741 printk("%02x", *p);
1742 if (i < (dev->hard_header_len - 1))
1743 printk(":");
1744 }
1745 printk("\n");
1746 }
1747 }
1748#endif
1749}
1750
Stephen Hemminger5969f712008-04-10 01:52:09 -07001751static int __mkroute_input(struct sk_buff *skb,
1752 struct fib_result *res,
1753 struct in_device *in_dev,
1754 __be32 daddr, __be32 saddr, u32 tos,
1755 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756{
1757
1758 struct rtable *rth;
1759 int err;
1760 struct in_device *out_dev;
1761 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001762 __be32 spec_dst;
1763 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001764
1765 /* get a working reference to the output device */
1766 out_dev = in_dev_get(FIB_RES_DEV(*res));
1767 if (out_dev == NULL) {
1768 if (net_ratelimit())
1769 printk(KERN_CRIT "Bug in ip_route_input" \
1770 "_slow(). Please, report\n");
1771 return -EINVAL;
1772 }
1773
1774
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001775 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001776 in_dev->dev, &spec_dst, &itag);
1777 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001778 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001779 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001780
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781 err = -EINVAL;
1782 goto cleanup;
1783 }
1784
1785 if (err)
1786 flags |= RTCF_DIRECTSRC;
1787
Rami Rosencb7928a2008-01-09 00:18:24 -08001788 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001789 (IN_DEV_SHARED_MEDIA(out_dev) ||
1790 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1791 flags |= RTCF_DOREDIRECT;
1792
1793 if (skb->protocol != htons(ETH_P_IP)) {
1794 /* Not IP (i.e. ARP). Do not create route, if it is
1795 * invalid for proxy arp. DNAT routes are always valid.
1796 */
Rami Rosencb7928a2008-01-09 00:18:24 -08001797 if (out_dev == in_dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001798 err = -EINVAL;
1799 goto cleanup;
1800 }
1801 }
1802
1803
1804 rth = dst_alloc(&ipv4_dst_ops);
1805 if (!rth) {
1806 err = -ENOBUFS;
1807 goto cleanup;
1808 }
1809
Julian Anastasovce723d82005-09-08 13:34:47 -07001810 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001811 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001812 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001813 rth->u.dst.flags |= DST_NOPOLICY;
Herbert Xu42f811b2007-06-04 23:34:44 -07001814 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001815 rth->u.dst.flags |= DST_NOXFRM;
1816 rth->fl.fl4_dst = daddr;
1817 rth->rt_dst = daddr;
1818 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001819 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820 rth->fl.fl4_src = saddr;
1821 rth->rt_src = saddr;
1822 rth->rt_gateway = daddr;
1823 rth->rt_iif =
1824 rth->fl.iif = in_dev->dev->ifindex;
1825 rth->u.dst.dev = (out_dev)->dev;
1826 dev_hold(rth->u.dst.dev);
1827 rth->idev = in_dev_get(rth->u.dst.dev);
1828 rth->fl.oif = 0;
1829 rth->rt_spec_dst= spec_dst;
1830
1831 rth->u.dst.input = ip_forward;
1832 rth->u.dst.output = ip_output;
Eric Dumazet29e75252008-01-31 17:05:09 -08001833 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001834
1835 rt_set_nexthop(rth, res, itag);
1836
1837 rth->rt_flags = flags;
1838
1839 *result = rth;
1840 err = 0;
1841 cleanup:
1842 /* release the working reference to the output device */
1843 in_dev_put(out_dev);
1844 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001845}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001846
Stephen Hemminger5969f712008-04-10 01:52:09 -07001847static int ip_mkroute_input(struct sk_buff *skb,
1848 struct fib_result *res,
1849 const struct flowi *fl,
1850 struct in_device *in_dev,
1851 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001852{
Chuck Short7abaa272005-06-22 22:10:23 -07001853 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001854 int err;
1855 unsigned hash;
1856
1857#ifdef CONFIG_IP_ROUTE_MULTIPATH
1858 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1859 fib_select_multipath(fl, res);
1860#endif
1861
1862 /* create a routing cache entry */
1863 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1864 if (err)
1865 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001866
1867 /* put it into the cache */
Al Viro8c7bc842006-09-26 21:26:19 -07001868 hash = rt_hash(daddr, saddr, fl->iif);
Eric Dumazetee6b9672008-03-05 18:30:47 -08001869 return rt_intern_hash(hash, rth, &skb->rtable);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870}
1871
/*
 *	NOTE. We drop all packets that have local source
 *	addresses, because every properly looped-back packet
 *	must already have the correct destination attached by the
 *	output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1881
Al Viro9e12bb22006-09-26 21:25:20 -07001882static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001883 u8 tos, struct net_device *dev)
1884{
1885 struct fib_result res;
1886 struct in_device *in_dev = in_dev_get(dev);
1887 struct flowi fl = { .nl_u = { .ip4_u =
1888 { .daddr = daddr,
1889 .saddr = saddr,
1890 .tos = tos,
1891 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001892 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001893 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894 .iif = dev->ifindex };
1895 unsigned flags = 0;
1896 u32 itag = 0;
1897 struct rtable * rth;
1898 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07001899 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001900 int err = -EINVAL;
1901 int free_res = 0;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001902 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001903
1904 /* IP on this device is disabled. */
1905
1906 if (!in_dev)
1907 goto out;
1908
1909 /* Check for the most weird martians, which can be not detected
1910 by fib_lookup.
1911 */
1912
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001913 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001914 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915 goto martian_source;
1916
Al Viroe4485152006-09-26 22:15:01 -07001917 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 goto brd_input;
1919
1920 /* Accept zero addresses only to limited broadcast;
1921 * I even do not know to fix it or not. Waiting for complains :-)
1922 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08001923 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924 goto martian_source;
1925
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001926 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001927 ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001928 goto martian_destination;
1929
1930 /*
1931 * Now we are ready to route packet.
1932 */
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001933 if ((err = fib_lookup(net, &fl, &res)) != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001934 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001935 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001936 goto no_route;
1937 }
1938 free_res = 1;
1939
1940 RT_CACHE_STAT_INC(in_slow_tot);
1941
1942 if (res.type == RTN_BROADCAST)
1943 goto brd_input;
1944
1945 if (res.type == RTN_LOCAL) {
1946 int result;
1947 result = fib_validate_source(saddr, daddr, tos,
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001948 net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001949 dev, &spec_dst, &itag);
1950 if (result < 0)
1951 goto martian_source;
1952 if (result)
1953 flags |= RTCF_DIRECTSRC;
1954 spec_dst = daddr;
1955 goto local_input;
1956 }
1957
1958 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001959 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001960 if (res.type != RTN_UNICAST)
1961 goto martian_destination;
1962
1963 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001964done:
1965 in_dev_put(in_dev);
1966 if (free_res)
1967 fib_res_put(&res);
1968out: return err;
1969
1970brd_input:
1971 if (skb->protocol != htons(ETH_P_IP))
1972 goto e_inval;
1973
Joe Perchesf97c1e02007-12-16 13:45:43 -08001974 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001975 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1976 else {
1977 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1978 &itag);
1979 if (err < 0)
1980 goto martian_source;
1981 if (err)
1982 flags |= RTCF_DIRECTSRC;
1983 }
1984 flags |= RTCF_BROADCAST;
1985 res.type = RTN_BROADCAST;
1986 RT_CACHE_STAT_INC(in_brd);
1987
1988local_input:
1989 rth = dst_alloc(&ipv4_dst_ops);
1990 if (!rth)
1991 goto e_nobufs;
1992
1993 rth->u.dst.output= ip_rt_bug;
Eric Dumazet29e75252008-01-31 17:05:09 -08001994 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001995
1996 atomic_set(&rth->u.dst.__refcnt, 1);
1997 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001998 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001999 rth->u.dst.flags |= DST_NOPOLICY;
2000 rth->fl.fl4_dst = daddr;
2001 rth->rt_dst = daddr;
2002 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002003 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002004 rth->fl.fl4_src = saddr;
2005 rth->rt_src = saddr;
2006#ifdef CONFIG_NET_CLS_ROUTE
2007 rth->u.dst.tclassid = itag;
2008#endif
2009 rth->rt_iif =
2010 rth->fl.iif = dev->ifindex;
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002011 rth->u.dst.dev = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002012 dev_hold(rth->u.dst.dev);
2013 rth->idev = in_dev_get(rth->u.dst.dev);
2014 rth->rt_gateway = daddr;
2015 rth->rt_spec_dst= spec_dst;
2016 rth->u.dst.input= ip_local_deliver;
2017 rth->rt_flags = flags|RTCF_LOCAL;
2018 if (res.type == RTN_UNREACHABLE) {
2019 rth->u.dst.input= ip_error;
2020 rth->u.dst.error= -err;
2021 rth->rt_flags &= ~RTCF_LOCAL;
2022 }
2023 rth->rt_type = res.type;
Al Viro8c7bc842006-09-26 21:26:19 -07002024 hash = rt_hash(daddr, saddr, fl.iif);
Eric Dumazetee6b9672008-03-05 18:30:47 -08002025 err = rt_intern_hash(hash, rth, &skb->rtable);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002026 goto done;
2027
2028no_route:
2029 RT_CACHE_STAT_INC(in_no_route);
2030 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2031 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002032 if (err == -ESRCH)
2033 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002034 goto local_input;
2035
2036 /*
2037 * Do not cache martian addresses: they should be logged (RFC1812)
2038 */
2039martian_destination:
2040 RT_CACHE_STAT_INC(in_martian_dst);
2041#ifdef CONFIG_IP_ROUTE_VERBOSE
2042 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07002043 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2044 NIPQUAD_FMT ", dev %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07002045 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2046#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002047
2048e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002049 err = -EHOSTUNREACH;
2050 goto done;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002051
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052e_inval:
2053 err = -EINVAL;
2054 goto done;
2055
2056e_nobufs:
2057 err = -ENOBUFS;
2058 goto done;
2059
2060martian_source:
2061 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2062 goto e_inval;
2063}
2064
Al Viro9e12bb22006-09-26 21:25:20 -07002065int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066 u8 tos, struct net_device *dev)
2067{
2068 struct rtable * rth;
2069 unsigned hash;
2070 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002071 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002072
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002073 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002074 tos &= IPTOS_RT_MASK;
Al Viro8c7bc842006-09-26 21:26:19 -07002075 hash = rt_hash(daddr, saddr, iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002076
2077 rcu_read_lock();
2078 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002079 rth = rcu_dereference(rth->u.dst.rt_next)) {
Stephen Hemmingerc0b8c322008-04-10 04:00:28 -07002080 if (((rth->fl.fl4_dst ^ daddr) |
2081 (rth->fl.fl4_src ^ saddr) |
2082 (rth->fl.iif ^ iif) |
2083 rth->fl.oif |
2084 (rth->fl.fl4_tos ^ tos)) == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002085 rth->fl.mark == skb->mark &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002086 net_eq(dev_net(rth->u.dst.dev), net) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08002087 rth->rt_genid == atomic_read(&rt_genid)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002088 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002089 RT_CACHE_STAT_INC(in_hit);
2090 rcu_read_unlock();
Eric Dumazetee6b9672008-03-05 18:30:47 -08002091 skb->rtable = rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002092 return 0;
2093 }
2094 RT_CACHE_STAT_INC(in_hlist_search);
2095 }
2096 rcu_read_unlock();
2097
2098 /* Multicast recognition logic is moved from route cache to here.
2099 The problem was that too many Ethernet cards have broken/missing
2100 hardware multicast filters :-( As result the host on multicasting
2101 network acquires a lot of useless route cache entries, sort of
2102 SDR messages from all the world. Now we try to get rid of them.
2103 Really, provided software IP multicast filter is organized
2104 reasonably (at least, hashed), it does not result in a slowdown
2105 comparing with route cache reject entries.
2106 Note, that multicast routers are not affected, because
2107 route cache entry is created eventually.
2108 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002109 if (ipv4_is_multicast(daddr)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002110 struct in_device *in_dev;
2111
2112 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002113 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002114 int our = ip_check_mc(in_dev, daddr, saddr,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002115 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116 if (our
2117#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002118 || (!ipv4_is_local_multicast(daddr) &&
2119 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002120#endif
2121 ) {
2122 rcu_read_unlock();
2123 return ip_route_input_mc(skb, daddr, saddr,
2124 tos, dev, our);
2125 }
2126 }
2127 rcu_read_unlock();
2128 return -EINVAL;
2129 }
2130 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2131}
2132
Stephen Hemminger5969f712008-04-10 01:52:09 -07002133static int __mkroute_output(struct rtable **result,
2134 struct fib_result *res,
2135 const struct flowi *fl,
2136 const struct flowi *oldflp,
2137 struct net_device *dev_out,
2138 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002139{
2140 struct rtable *rth;
2141 struct in_device *in_dev;
2142 u32 tos = RT_FL_TOS(oldflp);
2143 int err = 0;
2144
Joe Perchesf97c1e02007-12-16 13:45:43 -08002145 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146 return -EINVAL;
2147
Al Viroe4485152006-09-26 22:15:01 -07002148 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149 res->type = RTN_BROADCAST;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002150 else if (ipv4_is_multicast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151 res->type = RTN_MULTICAST;
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002152 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153 return -EINVAL;
2154
2155 if (dev_out->flags & IFF_LOOPBACK)
2156 flags |= RTCF_LOCAL;
2157
2158 /* get work reference to inet device */
2159 in_dev = in_dev_get(dev_out);
2160 if (!in_dev)
2161 return -EINVAL;
2162
2163 if (res->type == RTN_BROADCAST) {
2164 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2165 if (res->fi) {
2166 fib_info_put(res->fi);
2167 res->fi = NULL;
2168 }
2169 } else if (res->type == RTN_MULTICAST) {
2170 flags |= RTCF_MULTICAST|RTCF_LOCAL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002171 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002172 oldflp->proto))
2173 flags &= ~RTCF_LOCAL;
2174 /* If multicast route do not exist use
2175 default one, but do not gateway in this case.
2176 Yes, it is hack.
2177 */
2178 if (res->fi && res->prefixlen < 4) {
2179 fib_info_put(res->fi);
2180 res->fi = NULL;
2181 }
2182 }
2183
2184
2185 rth = dst_alloc(&ipv4_dst_ops);
2186 if (!rth) {
2187 err = -ENOBUFS;
2188 goto cleanup;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002189 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002190
Julian Anastasovce723d82005-09-08 13:34:47 -07002191 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002193 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194 rth->u.dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002195 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196 rth->u.dst.flags |= DST_NOPOLICY;
2197
2198 rth->fl.fl4_dst = oldflp->fl4_dst;
2199 rth->fl.fl4_tos = tos;
2200 rth->fl.fl4_src = oldflp->fl4_src;
2201 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002202 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002203 rth->rt_dst = fl->fl4_dst;
2204 rth->rt_src = fl->fl4_src;
2205 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002206 /* get references to the devices that are to be hold by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207 cache entry */
2208 rth->u.dst.dev = dev_out;
2209 dev_hold(dev_out);
2210 rth->idev = in_dev_get(dev_out);
2211 rth->rt_gateway = fl->fl4_dst;
2212 rth->rt_spec_dst= fl->fl4_src;
2213
2214 rth->u.dst.output=ip_output;
Eric Dumazet29e75252008-01-31 17:05:09 -08002215 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216
2217 RT_CACHE_STAT_INC(out_slow_tot);
2218
2219 if (flags & RTCF_LOCAL) {
2220 rth->u.dst.input = ip_local_deliver;
2221 rth->rt_spec_dst = fl->fl4_dst;
2222 }
2223 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2224 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002225 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002226 !(dev_out->flags & IFF_LOOPBACK)) {
2227 rth->u.dst.output = ip_mc_output;
2228 RT_CACHE_STAT_INC(out_slow_mc);
2229 }
2230#ifdef CONFIG_IP_MROUTE
2231 if (res->type == RTN_MULTICAST) {
2232 if (IN_DEV_MFORWARD(in_dev) &&
Joe Perchesf97c1e02007-12-16 13:45:43 -08002233 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002234 rth->u.dst.input = ip_mr_input;
2235 rth->u.dst.output = ip_mc_output;
2236 }
2237 }
2238#endif
2239 }
2240
2241 rt_set_nexthop(rth, res, 0);
2242
2243 rth->rt_flags = flags;
2244
2245 *result = rth;
2246 cleanup:
2247 /* release work reference to inet device */
2248 in_dev_put(in_dev);
2249
2250 return err;
2251}
2252
Stephen Hemminger5969f712008-04-10 01:52:09 -07002253static int ip_mkroute_output(struct rtable **rp,
2254 struct fib_result *res,
2255 const struct flowi *fl,
2256 const struct flowi *oldflp,
2257 struct net_device *dev_out,
2258 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002259{
Chuck Short7abaa272005-06-22 22:10:23 -07002260 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002261 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2262 unsigned hash;
2263 if (err == 0) {
Al Viro8c7bc842006-09-26 21:26:19 -07002264 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002265 err = rt_intern_hash(hash, rth, rp);
2266 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002267
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 return err;
2269}
2270
Linus Torvalds1da177e2005-04-16 15:20:36 -07002271/*
2272 * Major route resolver routine.
2273 */
2274
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002275static int ip_route_output_slow(struct net *net, struct rtable **rp,
2276 const struct flowi *oldflp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002277{
2278 u32 tos = RT_FL_TOS(oldflp);
2279 struct flowi fl = { .nl_u = { .ip4_u =
2280 { .daddr = oldflp->fl4_dst,
2281 .saddr = oldflp->fl4_src,
2282 .tos = tos & IPTOS_RT_MASK,
2283 .scope = ((tos & RTO_ONLINK) ?
2284 RT_SCOPE_LINK :
2285 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002286 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002287 .mark = oldflp->mark,
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002288 .iif = net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002289 .oif = oldflp->oif };
2290 struct fib_result res;
2291 unsigned flags = 0;
2292 struct net_device *dev_out = NULL;
2293 int free_res = 0;
2294 int err;
2295
2296
2297 res.fi = NULL;
2298#ifdef CONFIG_IP_MULTIPLE_TABLES
2299 res.r = NULL;
2300#endif
2301
2302 if (oldflp->fl4_src) {
2303 err = -EINVAL;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002304 if (ipv4_is_multicast(oldflp->fl4_src) ||
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002305 ipv4_is_lbcast(oldflp->fl4_src) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002306 ipv4_is_zeronet(oldflp->fl4_src))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002307 goto out;
2308
2309 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002310 dev_out = ip_dev_find(net, oldflp->fl4_src);
David S. Millerf6c5d732007-05-18 02:07:50 -07002311 if (dev_out == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312 goto out;
2313
2314 /* I removed check for oif == dev_out->oif here.
2315 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002316 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2317 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002318 2. Moreover, we are allowed to send packets with saddr
2319 of another iface. --ANK
2320 */
2321
David S. Millerf6c5d732007-05-18 02:07:50 -07002322 if (oldflp->oif == 0
Joe Perchesf97c1e02007-12-16 13:45:43 -08002323 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2324 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002325 /* Special hack: user can direct multicasts
2326 and limited broadcast via necessary interface
2327 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2328 This hack is not just for fun, it allows
2329 vic,vat and friends to work.
2330 They bind socket to loopback, set ttl to zero
2331 and expect that it will work.
2332 From the viewpoint of routing cache they are broken,
2333 because we are not allowed to build multicast path
2334 with loopback source addr (look, routing cache
2335 cannot know, that ttl is zero, so that packet
2336 will not leave this host and route is valid).
2337 Luckily, this hack is good workaround.
2338 */
2339
2340 fl.oif = dev_out->ifindex;
2341 goto make_route;
2342 }
2343 if (dev_out)
2344 dev_put(dev_out);
2345 dev_out = NULL;
2346 }
2347
2348
2349 if (oldflp->oif) {
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002350 dev_out = dev_get_by_index(net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002351 err = -ENODEV;
2352 if (dev_out == NULL)
2353 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002354
2355 /* RACE: Check return value of inet_select_addr instead. */
2356 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357 dev_put(dev_out);
2358 goto out; /* Wrong error code */
2359 }
2360
Joe Perchesf97c1e02007-12-16 13:45:43 -08002361 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2362 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002363 if (!fl.fl4_src)
2364 fl.fl4_src = inet_select_addr(dev_out, 0,
2365 RT_SCOPE_LINK);
2366 goto make_route;
2367 }
2368 if (!fl.fl4_src) {
Joe Perchesf97c1e02007-12-16 13:45:43 -08002369 if (ipv4_is_multicast(oldflp->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002370 fl.fl4_src = inet_select_addr(dev_out, 0,
2371 fl.fl4_scope);
2372 else if (!oldflp->fl4_dst)
2373 fl.fl4_src = inet_select_addr(dev_out, 0,
2374 RT_SCOPE_HOST);
2375 }
2376 }
2377
2378 if (!fl.fl4_dst) {
2379 fl.fl4_dst = fl.fl4_src;
2380 if (!fl.fl4_dst)
2381 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2382 if (dev_out)
2383 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002384 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002385 dev_hold(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002386 fl.oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002387 res.type = RTN_LOCAL;
2388 flags |= RTCF_LOCAL;
2389 goto make_route;
2390 }
2391
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002392 if (fib_lookup(net, &fl, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002393 res.fi = NULL;
2394 if (oldflp->oif) {
2395 /* Apparently, routing tables are wrong. Assume,
2396 that the destination is on link.
2397
2398 WHY? DW.
2399 Because we are allowed to send to iface
2400 even if it has NO routes and NO assigned
2401 addresses. When oif is specified, routing
2402 tables are looked up with only one purpose:
2403 to catch if destination is gatewayed, rather than
2404 direct. Moreover, if MSG_DONTROUTE is set,
2405 we send packet, ignoring both routing tables
2406 and ifaddr state. --ANK
2407
2408
2409 We could make it even if oif is unknown,
2410 likely IPv6, but we do not.
2411 */
2412
2413 if (fl.fl4_src == 0)
2414 fl.fl4_src = inet_select_addr(dev_out, 0,
2415 RT_SCOPE_LINK);
2416 res.type = RTN_UNICAST;
2417 goto make_route;
2418 }
2419 if (dev_out)
2420 dev_put(dev_out);
2421 err = -ENETUNREACH;
2422 goto out;
2423 }
2424 free_res = 1;
2425
2426 if (res.type == RTN_LOCAL) {
2427 if (!fl.fl4_src)
2428 fl.fl4_src = fl.fl4_dst;
2429 if (dev_out)
2430 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002431 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002432 dev_hold(dev_out);
2433 fl.oif = dev_out->ifindex;
2434 if (res.fi)
2435 fib_info_put(res.fi);
2436 res.fi = NULL;
2437 flags |= RTCF_LOCAL;
2438 goto make_route;
2439 }
2440
2441#ifdef CONFIG_IP_ROUTE_MULTIPATH
2442 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2443 fib_select_multipath(&fl, &res);
2444 else
2445#endif
2446 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002447 fib_select_default(net, &fl, &res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002448
2449 if (!fl.fl4_src)
2450 fl.fl4_src = FIB_RES_PREFSRC(res);
2451
2452 if (dev_out)
2453 dev_put(dev_out);
2454 dev_out = FIB_RES_DEV(res);
2455 dev_hold(dev_out);
2456 fl.oif = dev_out->ifindex;
2457
2458
2459make_route:
2460 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2461
2462
2463 if (free_res)
2464 fib_res_put(&res);
2465 if (dev_out)
2466 dev_put(dev_out);
2467out: return err;
2468}
2469
Denis V. Lunev611c1832008-01-22 22:06:48 -08002470int __ip_route_output_key(struct net *net, struct rtable **rp,
2471 const struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002472{
2473 unsigned hash;
2474 struct rtable *rth;
2475
Al Viro8c7bc842006-09-26 21:26:19 -07002476 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002477
2478 rcu_read_lock_bh();
2479 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002480 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002481 if (rth->fl.fl4_dst == flp->fl4_dst &&
2482 rth->fl.fl4_src == flp->fl4_src &&
2483 rth->fl.iif == 0 &&
2484 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002485 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002486 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002487 (IPTOS_RT_MASK | RTO_ONLINK)) &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002488 net_eq(dev_net(rth->u.dst.dev), net) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08002489 rth->rt_genid == atomic_read(&rt_genid)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002490 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491 RT_CACHE_STAT_INC(out_hit);
2492 rcu_read_unlock_bh();
2493 *rp = rth;
2494 return 0;
2495 }
2496 RT_CACHE_STAT_INC(out_hlist_search);
2497 }
2498 rcu_read_unlock_bh();
2499
Denis V. Lunev611c1832008-01-22 22:06:48 -08002500 return ip_route_output_slow(net, rp, flp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002501}
2502
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002503EXPORT_SYMBOL_GPL(__ip_route_output_key);
2504
David S. Miller14e50e52007-05-24 18:17:54 -07002505static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2506{
2507}
2508
2509static struct dst_ops ipv4_dst_blackhole_ops = {
2510 .family = AF_INET,
2511 .protocol = __constant_htons(ETH_P_IP),
2512 .destroy = ipv4_dst_destroy,
2513 .check = ipv4_dst_check,
2514 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2515 .entry_size = sizeof(struct rtable),
Eric Dumazete2422972008-01-30 20:07:45 -08002516 .entries = ATOMIC_INIT(0),
David S. Miller14e50e52007-05-24 18:17:54 -07002517};
2518
2519
Denis V. Lunevce259992008-03-22 17:42:37 -07002520static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
David S. Miller14e50e52007-05-24 18:17:54 -07002521{
2522 struct rtable *ort = *rp;
2523 struct rtable *rt = (struct rtable *)
2524 dst_alloc(&ipv4_dst_blackhole_ops);
2525
2526 if (rt) {
2527 struct dst_entry *new = &rt->u.dst;
2528
2529 atomic_set(&new->__refcnt, 1);
2530 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002531 new->input = dst_discard;
2532 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002533 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2534
2535 new->dev = ort->u.dst.dev;
2536 if (new->dev)
2537 dev_hold(new->dev);
2538
2539 rt->fl = ort->fl;
2540
2541 rt->idev = ort->idev;
2542 if (rt->idev)
2543 in_dev_hold(rt->idev);
Eric Dumazet29e75252008-01-31 17:05:09 -08002544 rt->rt_genid = atomic_read(&rt_genid);
David S. Miller14e50e52007-05-24 18:17:54 -07002545 rt->rt_flags = ort->rt_flags;
2546 rt->rt_type = ort->rt_type;
2547 rt->rt_dst = ort->rt_dst;
2548 rt->rt_src = ort->rt_src;
2549 rt->rt_iif = ort->rt_iif;
2550 rt->rt_gateway = ort->rt_gateway;
2551 rt->rt_spec_dst = ort->rt_spec_dst;
2552 rt->peer = ort->peer;
2553 if (rt->peer)
2554 atomic_inc(&rt->peer->refcnt);
2555
2556 dst_free(new);
2557 }
2558
2559 dst_release(&(*rp)->u.dst);
2560 *rp = rt;
2561 return (rt ? 0 : -ENOMEM);
2562}
2563
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002564int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2565 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002566{
2567 int err;
2568
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002569 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002570 return err;
2571
2572 if (flp->proto) {
2573 if (!flp->fl4_src)
2574 flp->fl4_src = (*rp)->rt_src;
2575 if (!flp->fl4_dst)
2576 flp->fl4_dst = (*rp)->rt_dst;
Herbert Xubb728452007-12-12 18:48:58 -08002577 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2578 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002579 if (err == -EREMOTE)
Denis V. Lunevce259992008-03-22 17:42:37 -07002580 err = ipv4_dst_blackhole(rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002581
2582 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002583 }
2584
2585 return 0;
2586}
2587
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002588EXPORT_SYMBOL_GPL(ip_route_output_flow);
2589
Denis V. Lunevf2063512008-01-22 22:07:34 -08002590int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002591{
Denis V. Lunevf2063512008-01-22 22:07:34 -08002592 return ip_route_output_flow(net, rp, flp, NULL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593}
2594
2595static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002596 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002597{
Eric Dumazetee6b9672008-03-05 18:30:47 -08002598 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002599 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002600 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002601 long expires;
2602 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002603
2604 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2605 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002606 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002607
2608 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002609 r->rtm_family = AF_INET;
2610 r->rtm_dst_len = 32;
2611 r->rtm_src_len = 0;
2612 r->rtm_tos = rt->fl.fl4_tos;
2613 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002614 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002615 r->rtm_type = rt->rt_type;
2616 r->rtm_scope = RT_SCOPE_UNIVERSE;
2617 r->rtm_protocol = RTPROT_UNSPEC;
2618 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2619 if (rt->rt_flags & RTCF_NOTIFY)
2620 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002621
Al Viro17fb2c62006-09-26 22:15:25 -07002622 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002623
Linus Torvalds1da177e2005-04-16 15:20:36 -07002624 if (rt->fl.fl4_src) {
2625 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002626 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002627 }
2628 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002629 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002630#ifdef CONFIG_NET_CLS_ROUTE
2631 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002632 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002634 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002635 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002636 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002637 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002638
Linus Torvalds1da177e2005-04-16 15:20:36 -07002639 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002640 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002641
Linus Torvalds1da177e2005-04-16 15:20:36 -07002642 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002643 goto nla_put_failure;
2644
Thomas Grafe3703b32006-11-27 09:27:07 -08002645 error = rt->u.dst.error;
2646 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647 if (rt->peer) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002648 id = rt->peer->ip_id_count;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002649 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002650 ts = rt->peer->tcp_ts;
James Morris9d729f72007-03-04 16:12:44 -08002651 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002652 }
2653 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002654
Linus Torvalds1da177e2005-04-16 15:20:36 -07002655 if (rt->fl.iif) {
2656#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002657 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002658
Joe Perchesf97c1e02007-12-16 13:45:43 -08002659 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Pavel Emelyanov586f1212007-12-16 13:32:48 -08002660 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002661 int err = ipmr_get_route(skb, r, nowait);
2662 if (err <= 0) {
2663 if (!nowait) {
2664 if (err == 0)
2665 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002666 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002667 } else {
2668 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002669 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002670 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002671 }
2672 }
2673 } else
2674#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002675 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002676 }
2677
Thomas Grafe3703b32006-11-27 09:27:07 -08002678 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2679 expires, error) < 0)
2680 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681
Thomas Grafbe403ea2006-08-17 18:15:17 -07002682 return nlmsg_end(skb, nlh);
2683
2684nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002685 nlmsg_cancel(skb, nlh);
2686 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002687}
2688
Thomas Graf63f34442007-03-22 11:55:17 -07002689static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002690{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002691 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07002692 struct rtmsg *rtm;
2693 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002694 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002695 __be32 dst = 0;
2696 __be32 src = 0;
2697 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002698 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002699 struct sk_buff *skb;
2700
Thomas Grafd889ce32006-08-17 18:15:44 -07002701 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2702 if (err < 0)
2703 goto errout;
2704
2705 rtm = nlmsg_data(nlh);
2706
Linus Torvalds1da177e2005-04-16 15:20:36 -07002707 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002708 if (skb == NULL) {
2709 err = -ENOBUFS;
2710 goto errout;
2711 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002712
2713 /* Reserve room for dummy headers, this skb can pass
2714 through good chunk of routing engine.
2715 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002716 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002717 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002718
2719 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002720 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002721 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2722
Al Viro17fb2c62006-09-26 22:15:25 -07002723 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2724 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002725 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002726
2727 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002728 struct net_device *dev;
2729
Denis V. Lunev19375042008-02-28 20:52:04 -08002730 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002731 if (dev == NULL) {
2732 err = -ENODEV;
2733 goto errout_free;
2734 }
2735
Linus Torvalds1da177e2005-04-16 15:20:36 -07002736 skb->protocol = htons(ETH_P_IP);
2737 skb->dev = dev;
2738 local_bh_disable();
2739 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2740 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002741
Eric Dumazetee6b9672008-03-05 18:30:47 -08002742 rt = skb->rtable;
Thomas Grafd889ce32006-08-17 18:15:44 -07002743 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002744 err = -rt->u.dst.error;
2745 } else {
Thomas Grafd889ce32006-08-17 18:15:44 -07002746 struct flowi fl = {
2747 .nl_u = {
2748 .ip4_u = {
2749 .daddr = dst,
2750 .saddr = src,
2751 .tos = rtm->rtm_tos,
2752 },
2753 },
2754 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2755 };
Denis V. Lunev19375042008-02-28 20:52:04 -08002756 err = ip_route_output_key(net, &rt, &fl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002757 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002758
Linus Torvalds1da177e2005-04-16 15:20:36 -07002759 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002760 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002761
Eric Dumazetee6b9672008-03-05 18:30:47 -08002762 skb->rtable = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002763 if (rtm->rtm_flags & RTM_F_NOTIFY)
2764 rt->rt_flags |= RTCF_NOTIFY;
2765
Linus Torvalds1da177e2005-04-16 15:20:36 -07002766 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08002767 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002768 if (err <= 0)
2769 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002770
Denis V. Lunev19375042008-02-28 20:52:04 -08002771 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002772errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002773 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002774
Thomas Grafd889ce32006-08-17 18:15:44 -07002775errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002776 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002777 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002778}
2779
2780int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2781{
2782 struct rtable *rt;
2783 int h, s_h;
2784 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08002785 struct net *net;
2786
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002787 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002788
2789 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002790 if (s_h < 0)
2791 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002792 s_idx = idx = cb->args[1];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002793 for (h = s_h; h <= rt_hash_mask; h++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794 rcu_read_lock_bh();
2795 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002796 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002797 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798 continue;
Eric Dumazet29e75252008-01-31 17:05:09 -08002799 if (rt->rt_genid != atomic_read(&rt_genid))
2800 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002801 skb->dst = dst_clone(&rt->u.dst);
2802 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002803 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002804 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002805 dst_release(xchg(&skb->dst, NULL));
2806 rcu_read_unlock_bh();
2807 goto done;
2808 }
2809 dst_release(xchg(&skb->dst, NULL));
2810 }
2811 rcu_read_unlock_bh();
Eric Dumazetd8c92832008-01-07 21:52:14 -08002812 s_idx = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813 }
2814
2815done:
2816 cb->args[0] = h;
2817 cb->args[1] = idx;
2818 return skb->len;
2819}
2820
/*
 * Multicast event hook: flushes the route cache with a delay argument
 * of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2825
2826#ifdef CONFIG_SYSCTL
2827static int flush_delay;
2828
2829static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2830 struct file *filp, void __user *buffer,
2831 size_t *lenp, loff_t *ppos)
2832{
2833 if (write) {
2834 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2835 rt_cache_flush(flush_delay);
2836 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002837 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002838
2839 return -EINVAL;
2840}
2841
2842static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2843 int __user *name,
2844 int nlen,
2845 void __user *oldval,
2846 size_t __user *oldlenp,
2847 void __user *newval,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -08002848 size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849{
2850 int delay;
2851 if (newlen != sizeof(int))
2852 return -EINVAL;
2853 if (get_user(delay, (int __user *)newval))
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002854 return -EFAULT;
2855 rt_cache_flush(delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002856 return 0;
2857}
2858
2859ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002860 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2862 .procname = "flush",
2863 .data = &flush_delay,
2864 .maxlen = sizeof(int),
Dave Jones7e3e0362005-04-28 12:11:03 -07002865 .mode = 0200,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002866 .proc_handler = &ipv4_sysctl_rtcache_flush,
2867 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2868 },
2869 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002870 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2871 .procname = "gc_thresh",
2872 .data = &ipv4_dst_ops.gc_thresh,
2873 .maxlen = sizeof(int),
2874 .mode = 0644,
2875 .proc_handler = &proc_dointvec,
2876 },
2877 {
2878 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2879 .procname = "max_size",
2880 .data = &ip_rt_max_size,
2881 .maxlen = sizeof(int),
2882 .mode = 0644,
2883 .proc_handler = &proc_dointvec,
2884 },
2885 {
2886 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002887
Linus Torvalds1da177e2005-04-16 15:20:36 -07002888 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2889 .procname = "gc_min_interval",
2890 .data = &ip_rt_gc_min_interval,
2891 .maxlen = sizeof(int),
2892 .mode = 0644,
2893 .proc_handler = &proc_dointvec_jiffies,
2894 .strategy = &sysctl_jiffies,
2895 },
2896 {
2897 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2898 .procname = "gc_min_interval_ms",
2899 .data = &ip_rt_gc_min_interval,
2900 .maxlen = sizeof(int),
2901 .mode = 0644,
2902 .proc_handler = &proc_dointvec_ms_jiffies,
2903 .strategy = &sysctl_ms_jiffies,
2904 },
2905 {
2906 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2907 .procname = "gc_timeout",
2908 .data = &ip_rt_gc_timeout,
2909 .maxlen = sizeof(int),
2910 .mode = 0644,
2911 .proc_handler = &proc_dointvec_jiffies,
2912 .strategy = &sysctl_jiffies,
2913 },
2914 {
2915 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2916 .procname = "gc_interval",
2917 .data = &ip_rt_gc_interval,
2918 .maxlen = sizeof(int),
2919 .mode = 0644,
2920 .proc_handler = &proc_dointvec_jiffies,
2921 .strategy = &sysctl_jiffies,
2922 },
2923 {
2924 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2925 .procname = "redirect_load",
2926 .data = &ip_rt_redirect_load,
2927 .maxlen = sizeof(int),
2928 .mode = 0644,
2929 .proc_handler = &proc_dointvec,
2930 },
2931 {
2932 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2933 .procname = "redirect_number",
2934 .data = &ip_rt_redirect_number,
2935 .maxlen = sizeof(int),
2936 .mode = 0644,
2937 .proc_handler = &proc_dointvec,
2938 },
2939 {
2940 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2941 .procname = "redirect_silence",
2942 .data = &ip_rt_redirect_silence,
2943 .maxlen = sizeof(int),
2944 .mode = 0644,
2945 .proc_handler = &proc_dointvec,
2946 },
2947 {
2948 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2949 .procname = "error_cost",
2950 .data = &ip_rt_error_cost,
2951 .maxlen = sizeof(int),
2952 .mode = 0644,
2953 .proc_handler = &proc_dointvec,
2954 },
2955 {
2956 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2957 .procname = "error_burst",
2958 .data = &ip_rt_error_burst,
2959 .maxlen = sizeof(int),
2960 .mode = 0644,
2961 .proc_handler = &proc_dointvec,
2962 },
2963 {
2964 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2965 .procname = "gc_elasticity",
2966 .data = &ip_rt_gc_elasticity,
2967 .maxlen = sizeof(int),
2968 .mode = 0644,
2969 .proc_handler = &proc_dointvec,
2970 },
2971 {
2972 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2973 .procname = "mtu_expires",
2974 .data = &ip_rt_mtu_expires,
2975 .maxlen = sizeof(int),
2976 .mode = 0644,
2977 .proc_handler = &proc_dointvec_jiffies,
2978 .strategy = &sysctl_jiffies,
2979 },
2980 {
2981 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2982 .procname = "min_pmtu",
2983 .data = &ip_rt_min_pmtu,
2984 .maxlen = sizeof(int),
2985 .mode = 0644,
2986 .proc_handler = &proc_dointvec,
2987 },
2988 {
2989 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2990 .procname = "min_adv_mss",
2991 .data = &ip_rt_min_advmss,
2992 .maxlen = sizeof(int),
2993 .mode = 0644,
2994 .proc_handler = &proc_dointvec,
2995 },
2996 {
2997 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2998 .procname = "secret_interval",
2999 .data = &ip_rt_secret_interval,
3000 .maxlen = sizeof(int),
3001 .mode = 0644,
3002 .proc_handler = &proc_dointvec_jiffies,
3003 .strategy = &sysctl_jiffies,
3004 },
3005 { .ctl_name = 0 }
3006};
3007#endif
3008
#ifdef CONFIG_NET_CLS_ROUTE
/* Per-cpu accounting area; allocated in ip_rt_init() (256 entries). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
3012
3013static __initdata unsigned long rhash_entries;
3014static int __init set_rhash_entries(char *str)
3015{
3016 if (!str)
3017 return 0;
3018 rhash_entries = simple_strtoul(str, &str, 0);
3019 return 1;
3020}
3021__setup("rhash_entries=", set_rhash_entries);
3022
3023int __init ip_rt_init(void)
3024{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003025 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003026
Eric Dumazet29e75252008-01-31 17:05:09 -08003027 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3028 (jiffies ^ (jiffies >> 7))));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003029
3030#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet8dbde282007-11-16 03:32:10 -08003031 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003032 if (!ip_rt_acct)
3033 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003034#endif
3035
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003036 ipv4_dst_ops.kmem_cachep =
3037 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003038 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003039
David S. Miller14e50e52007-05-24 18:17:54 -07003040 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3041
Eric Dumazet424c4b72005-07-05 14:58:19 -07003042 rt_hash_table = (struct rt_hash_bucket *)
3043 alloc_large_system_hash("IP route cache",
3044 sizeof(struct rt_hash_bucket),
3045 rhash_entries,
3046 (num_physpages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003047 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003048 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003049 &rt_hash_log,
3050 &rt_hash_mask,
3051 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003052 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3053 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003054
3055 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3056 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3057
Linus Torvalds1da177e2005-04-16 15:20:36 -07003058 devinet_init();
3059 ip_fib_init();
3060
Stephen Hemminger2fa75272008-04-10 01:55:27 -07003061 rt_secret_timer.function = rt_secret_rebuild;
3062 rt_secret_timer.data = 0;
3063 init_timer_deferrable(&rt_secret_timer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003064
3065 /* All the timers, started at system startup tend
3066 to synchronize. Perturb it a bit.
3067 */
Eric Dumazet39c90ec2007-09-15 10:55:54 -07003068 schedule_delayed_work(&expires_work,
3069 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003070
3071 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3072 ip_rt_secret_interval;
3073 add_timer(&rt_secret_timer);
3074
Denis V. Lunev73b38712008-02-28 20:51:18 -08003075 if (ip_rt_proc_init())
Pavel Emelyanov107f1632007-12-05 21:14:28 -08003076 printk(KERN_ERR "Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003077#ifdef CONFIG_XFRM
3078 xfrm_init();
3079 xfrm4_init();
3080#endif
Thomas Graf63f34442007-03-22 11:55:17 -07003081 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3082
Linus Torvalds1da177e2005-04-16 15:20:36 -07003083 return rc;
3084}
3085
3086EXPORT_SYMBOL(__ip_select_ident);
3087EXPORT_SYMBOL(ip_route_input);
3088EXPORT_SYMBOL(ip_route_output_key);