blob: 9725223ffe9d2d45083fe8182f191ef498aa0032 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090021 * Alan Cox : Super /proc >4K
Linus Torvalds1da177e2005-04-16 15:20:36 -070022 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090039 *
Linus Torvalds1da177e2005-04-16 15:20:36 -070040 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
Eric Dumazetbb1d23b2005-07-05 15:00:32 -070055 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
Ilia Sotnikovcef26852006-03-25 01:38:55 -080056 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
Linus Torvalds1da177e2005-04-16 15:20:36 -070058 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
Linus Torvalds1da177e2005-04-16 15:20:36 -070065#include <linux/module.h>
66#include <asm/uaccess.h>
67#include <asm/system.h>
68#include <linux/bitops.h>
69#include <linux/types.h>
70#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070071#include <linux/mm.h>
Eric Dumazet424c4b72005-07-05 14:58:19 -070072#include <linux/bootmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070073#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
Eric Dumazet39c90ec2007-09-15 10:55:54 -070082#include <linux/workqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070083#include <linux/skbuff.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070084#include <linux/inetdevice.h>
85#include <linux/igmp.h>
86#include <linux/pkt_sched.h>
87#include <linux/mroute.h>
88#include <linux/netfilter_ipv4.h>
89#include <linux/random.h>
90#include <linux/jhash.h>
91#include <linux/rcupdate.h>
92#include <linux/times.h>
Herbert Xu352e5122007-11-13 21:34:06 -080093#include <net/dst.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +020094#include <net/net_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070095#include <net/protocol.h>
96#include <net/ip.h>
97#include <net/route.h>
98#include <net/inetpeer.h>
99#include <net/sock.h>
100#include <net/ip_fib.h>
101#include <net/arp.h>
102#include <net/tcp.h>
103#include <net/icmp.h>
104#include <net/xfrm.h>
Tom Tucker8d717402006-07-30 20:43:36 -0700105#include <net/netevent.h>
Thomas Graf63f34442007-03-22 11:55:17 -0700106#include <net/rtnetlink.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700107#ifdef CONFIG_SYSCTL
108#include <linux/sysctl.h>
109#endif
110
/*
 * Extract the routing-relevant TOS bits (plus the RTO_ONLINK flag) from a
 * flow key.  The argument is parenthesized so that any pointer expression
 * (e.g. "&fl" or "flp + 1") can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

/* Value used as the largest IPv4 MTU in this file. */
#define IP_MAX_MTU	0xFFF0

/* Default garbage-collection timeout: 300 seconds, expressed in jiffies. */
#define RT_GC_TIMEOUT (300*HZ)
117
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118static int ip_rt_max_size;
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700119static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
120static int ip_rt_gc_interval __read_mostly = 60 * HZ;
121static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
122static int ip_rt_redirect_number __read_mostly = 9;
123static int ip_rt_redirect_load __read_mostly = HZ / 50;
124static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125static int ip_rt_error_cost __read_mostly = HZ;
126static int ip_rt_error_burst __read_mostly = 5 * HZ;
127static int ip_rt_gc_elasticity __read_mostly = 8;
128static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
129static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
130static int ip_rt_min_advmss __read_mostly = 256;
131static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700132
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800133static void rt_worker_func(struct work_struct *work);
134static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135
136/*
137 * Interface to generic destination cache.
138 */
139
140static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141static void ipv4_dst_destroy(struct dst_entry *dst);
142static void ipv4_dst_ifdown(struct dst_entry *dst,
143 struct net_device *dev, int how);
144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145static void ipv4_link_failure(struct sk_buff *skb);
146static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
Daniel Lezcano569d3642008-01-18 03:56:57 -0800147static int rt_garbage_collect(struct dst_ops *ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148
149
150static struct dst_ops ipv4_dst_ops = {
151 .family = AF_INET,
152 .protocol = __constant_htons(ETH_P_IP),
153 .gc = rt_garbage_collect,
154 .check = ipv4_dst_check,
155 .destroy = ipv4_dst_destroy,
156 .ifdown = ipv4_dst_ifdown,
157 .negative_advice = ipv4_negative_advice,
158 .link_failure = ipv4_link_failure,
159 .update_pmtu = ip_rt_update_pmtu,
Herbert Xu1ac06e02008-05-20 14:32:14 -0700160 .local_out = __ip_local_out,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161 .entry_size = sizeof(struct rtable),
Eric Dumazete2422972008-01-30 20:07:45 -0800162 .entries = ATOMIC_INIT(0),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163};
164
165#define ECN_OR_COST(class) TC_PRIO_##class
166
Philippe De Muyter4839c522007-07-09 15:32:57 -0700167const __u8 ip_tos2prio[16] = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700168 TC_PRIO_BESTEFFORT,
169 ECN_OR_COST(FILLER),
170 TC_PRIO_BESTEFFORT,
171 ECN_OR_COST(BESTEFFORT),
172 TC_PRIO_BULK,
173 ECN_OR_COST(BULK),
174 TC_PRIO_BULK,
175 ECN_OR_COST(BULK),
176 TC_PRIO_INTERACTIVE,
177 ECN_OR_COST(INTERACTIVE),
178 TC_PRIO_INTERACTIVE,
179 ECN_OR_COST(INTERACTIVE),
180 TC_PRIO_INTERACTIVE_BULK,
181 ECN_OR_COST(INTERACTIVE_BULK),
182 TC_PRIO_INTERACTIVE_BULK,
183 ECN_OR_COST(INTERACTIVE_BULK)
184};
185
186
187/*
188 * Route cache.
189 */
190
/* The locking scheme is rather straightforward:
192 *
193 * 1) Read-Copy Update protects the buckets of the central route hash.
194 * 2) Only writers remove entries, and they hold the lock
195 * as they look at rtable reference counts.
196 * 3) Only readers acquire references to rtable entries,
197 * they do so with atomic increments and with the
198 * lock held.
199 */
200
/* One slot of the route cache hash table. */
struct rt_hash_bucket {
	/* Head of this slot's list; entries are linked through u.dst.rt_next. */
	struct rtable	*chain;
};
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700204#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205 defined(CONFIG_PROVE_LOCKING)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700206/*
207 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
208 * The size of this table is a power of two and depends on the number of CPUS.
Ingo Molnar62051202006-07-03 00:24:59 -0700209 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
Eric Dumazet22c047c2005-07-05 14:55:24 -0700210 */
Ingo Molnar62051202006-07-03 00:24:59 -0700211#ifdef CONFIG_LOCKDEP
212# define RT_HASH_LOCK_SZ 256
Eric Dumazet22c047c2005-07-05 14:55:24 -0700213#else
Ingo Molnar62051202006-07-03 00:24:59 -0700214# if NR_CPUS >= 32
215# define RT_HASH_LOCK_SZ 4096
216# elif NR_CPUS >= 16
217# define RT_HASH_LOCK_SZ 2048
218# elif NR_CPUS >= 8
219# define RT_HASH_LOCK_SZ 1024
220# elif NR_CPUS >= 4
221# define RT_HASH_LOCK_SZ 512
222# else
223# define RT_HASH_LOCK_SZ 256
224# endif
Eric Dumazet22c047c2005-07-05 14:55:24 -0700225#endif
226
227static spinlock_t *rt_hash_locks;
228# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
Pavel Emelyanov1ff1cc22007-12-05 21:15:05 -0800229
230static __init void rt_hash_lock_init(void)
231{
232 int i;
233
234 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
235 GFP_KERNEL);
236 if (!rt_hash_locks)
237 panic("IP: failed to allocate rt_hash_locks\n");
238
239 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240 spin_lock_init(&rt_hash_locks[i]);
241}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700242#else
/* No lock table in this configuration: every bucket's lock address is NULL. */
# define rt_hash_lock_addr(slot) NULL

/* Nothing to set up when there is no lock table. */
static inline void rt_hash_lock_init(void)
{
	/* intentionally empty */
}
Eric Dumazet22c047c2005-07-05 14:55:24 -0700248#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700249
Stephen Hemminger817bc4d2008-03-22 17:43:59 -0700250static struct rt_hash_bucket *rt_hash_table __read_mostly;
251static unsigned rt_hash_mask __read_mostly;
252static unsigned int rt_hash_log __read_mostly;
253static atomic_t rt_genid __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700254
Eric Dumazet2f970d82006-01-17 02:54:36 -0800255static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
Andrew Mortondbd29152006-01-17 21:58:01 -0800256#define RT_CACHE_STAT_INC(field) \
Paul Mackerrasbfe5d832006-06-25 05:47:14 -0700257 (__raw_get_cpu_var(rt_cache_stat).field++)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258
Stephen Hemminger1294fc42008-04-10 01:54:01 -0700259static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260{
Stephen Hemminger1294fc42008-04-10 01:54:01 -0700261 return jhash_3words((__force u32)(__be32)(daddr),
262 (__force u32)(__be32)(saddr),
263 idx, atomic_read(&rt_genid))
Eric Dumazet29e75252008-01-31 17:05:09 -0800264 & rt_hash_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700265}
266
267#ifdef CONFIG_PROC_FS
268struct rt_cache_iter_state {
Denis V. Luneva75e9362008-02-28 20:50:55 -0800269 struct seq_net_private p;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700270 int bucket;
Eric Dumazet29e75252008-01-31 17:05:09 -0800271 int genid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700272};
273
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900274static struct rtable *rt_cache_get_first(struct seq_file *seq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900276 struct rt_cache_iter_state *st = seq->private;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277 struct rtable *r = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278
279 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
280 rcu_read_lock_bh();
Eric Dumazet29e75252008-01-31 17:05:09 -0800281 r = rcu_dereference(rt_hash_table[st->bucket].chain);
282 while (r) {
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900283 if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
Denis V. Luneva75e9362008-02-28 20:50:55 -0800284 r->rt_genid == st->genid)
Eric Dumazet29e75252008-01-31 17:05:09 -0800285 return r;
286 r = rcu_dereference(r->u.dst.rt_next);
287 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700288 rcu_read_unlock_bh();
289 }
Eric Dumazet29e75252008-01-31 17:05:09 -0800290 return r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700291}
292
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900293static struct rtable *__rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800294 struct rtable *r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700295{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900296 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800297 r = r->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700298 while (!r) {
299 rcu_read_unlock_bh();
300 if (--st->bucket < 0)
301 break;
302 rcu_read_lock_bh();
303 r = rt_hash_table[st->bucket].chain;
304 }
Eric Dumazet0bccead2008-01-10 03:55:57 -0800305 return rcu_dereference(r);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700306}
307
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900308static struct rtable *rt_cache_get_next(struct seq_file *seq,
Denis V. Lunev642d6312008-02-28 20:50:33 -0800309 struct rtable *r)
310{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900311 struct rt_cache_iter_state *st = seq->private;
312 while ((r = __rt_cache_get_next(seq, r)) != NULL) {
313 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
Denis V. Luneva75e9362008-02-28 20:50:55 -0800314 continue;
Denis V. Lunev642d6312008-02-28 20:50:33 -0800315 if (r->rt_genid == st->genid)
316 break;
317 }
318 return r;
319}
320
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900321static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322{
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900323 struct rtable *r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700324
325 if (r)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900326 while (pos && (r = rt_cache_get_next(seq, r)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700327 --pos;
328 return pos ? NULL : r;
329}
330
331static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
332{
Eric Dumazet29e75252008-01-31 17:05:09 -0800333 struct rt_cache_iter_state *st = seq->private;
Eric Dumazet29e75252008-01-31 17:05:09 -0800334 if (*pos)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900335 return rt_cache_get_idx(seq, *pos - 1);
Eric Dumazet29e75252008-01-31 17:05:09 -0800336 st->genid = atomic_read(&rt_genid);
337 return SEQ_START_TOKEN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338}
339
340static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
341{
Eric Dumazet29e75252008-01-31 17:05:09 -0800342 struct rtable *r;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343
344 if (v == SEQ_START_TOKEN)
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900345 r = rt_cache_get_first(seq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700346 else
YOSHIFUJI Hideaki12188542008-03-26 02:36:06 +0900347 r = rt_cache_get_next(seq, v);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700348 ++*pos;
349 return r;
350}
351
352static void rt_cache_seq_stop(struct seq_file *seq, void *v)
353{
354 if (v && v != SEQ_START_TOKEN)
355 rcu_read_unlock_bh();
356}
357
358static int rt_cache_seq_show(struct seq_file *seq, void *v)
359{
360 if (v == SEQ_START_TOKEN)
361 seq_printf(seq, "%-127s\n",
362 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
363 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
364 "HHUptod\tSpecDst");
365 else {
366 struct rtable *r = v;
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700367 int len;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700368
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700369 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
370 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
Linus Torvalds1da177e2005-04-16 15:20:36 -0700371 r->u.dst.dev ? r->u.dst.dev->name : "*",
372 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
373 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
374 r->u.dst.__use, 0, (unsigned long)r->rt_src,
375 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
376 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
377 dst_metric(&r->u.dst, RTAX_WINDOW),
378 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
379 dst_metric(&r->u.dst, RTAX_RTTVAR)),
380 r->fl.fl4_tos,
381 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
382 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
383 dev_queue_xmit) : 0,
Pavel Emelyanov5e659e42008-04-24 01:02:16 -0700384 r->rt_spec_dst, &len);
385
386 seq_printf(seq, "%*s\n", 127 - len, "");
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900387 }
388 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700389}
390
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700391static const struct seq_operations rt_cache_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700392 .start = rt_cache_seq_start,
393 .next = rt_cache_seq_next,
394 .stop = rt_cache_seq_stop,
395 .show = rt_cache_seq_show,
396};
397
398static int rt_cache_seq_open(struct inode *inode, struct file *file)
399{
Denis V. Luneva75e9362008-02-28 20:50:55 -0800400 return seq_open_net(inode, file, &rt_cache_seq_ops,
Pavel Emelyanovcf7732e2007-10-10 02:29:29 -0700401 sizeof(struct rt_cache_iter_state));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700402}
403
Arjan van de Ven9a321442007-02-12 00:55:35 -0800404static const struct file_operations rt_cache_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700405 .owner = THIS_MODULE,
406 .open = rt_cache_seq_open,
407 .read = seq_read,
408 .llseek = seq_lseek,
Denis V. Luneva75e9362008-02-28 20:50:55 -0800409 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700410};
411
412
413static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
414{
415 int cpu;
416
417 if (*pos == 0)
418 return SEQ_START_TOKEN;
419
420 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
421 if (!cpu_possible(cpu))
422 continue;
423 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800424 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700425 }
426 return NULL;
427}
428
429static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
430{
431 int cpu;
432
433 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
434 if (!cpu_possible(cpu))
435 continue;
436 *pos = cpu+1;
Eric Dumazet2f970d82006-01-17 02:54:36 -0800437 return &per_cpu(rt_cache_stat, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700438 }
439 return NULL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900440
Linus Torvalds1da177e2005-04-16 15:20:36 -0700441}
442
/* seq_file ->stop for the per-CPU statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* intentionally empty */
}
447
448static int rt_cpu_seq_show(struct seq_file *seq, void *v)
449{
450 struct rt_cache_stat *st = v;
451
452 if (v == SEQ_START_TOKEN) {
Olaf Rempel5bec0032005-04-28 12:16:08 -0700453 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700454 return 0;
455 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900456
Linus Torvalds1da177e2005-04-16 15:20:36 -0700457 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
458 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
459 atomic_read(&ipv4_dst_ops.entries),
460 st->in_hit,
461 st->in_slow_tot,
462 st->in_slow_mc,
463 st->in_no_route,
464 st->in_brd,
465 st->in_martian_dst,
466 st->in_martian_src,
467
468 st->out_hit,
469 st->out_slow_tot,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900470 st->out_slow_mc,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700471
472 st->gc_total,
473 st->gc_ignored,
474 st->gc_goal_miss,
475 st->gc_dst_overflow,
476 st->in_hlist_search,
477 st->out_hlist_search
478 );
479 return 0;
480}
481
Stephen Hemmingerf6908082007-03-12 14:34:29 -0700482static const struct seq_operations rt_cpu_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483 .start = rt_cpu_seq_start,
484 .next = rt_cpu_seq_next,
485 .stop = rt_cpu_seq_stop,
486 .show = rt_cpu_seq_show,
487};
488
489
490static int rt_cpu_seq_open(struct inode *inode, struct file *file)
491{
492 return seq_open(file, &rt_cpu_seq_ops);
493}
494
Arjan van de Ven9a321442007-02-12 00:55:35 -0800495static const struct file_operations rt_cpu_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700496 .owner = THIS_MODULE,
497 .open = rt_cpu_seq_open,
498 .read = seq_read,
499 .llseek = seq_lseek,
500 .release = seq_release,
501};
502
Pavel Emelyanov78c686e2007-12-05 21:13:48 -0800503#ifdef CONFIG_NET_CLS_ROUTE
504static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
505 int length, int *eof, void *data)
506{
507 unsigned int i;
508
509 if ((offset & 3) || (length & 3))
510 return -EIO;
511
512 if (offset >= sizeof(struct ip_rt_acct) * 256) {
513 *eof = 1;
514 return 0;
515 }
516
517 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
518 length = sizeof(struct ip_rt_acct) * 256 - offset;
519 *eof = 1;
520 }
521
522 offset /= sizeof(u32);
523
524 if (length > 0) {
525 u32 *dst = (u32 *) buffer;
526
527 *start = buffer;
528 memset(dst, 0, length);
529
530 for_each_possible_cpu(i) {
531 unsigned int j;
532 u32 *src;
533
534 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
535 for (j = 0; j < length/4; j++)
536 dst[j] += src[j];
537 }
538 }
539 return length;
540}
541#endif
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800542
Denis V. Lunev73b38712008-02-28 20:51:18 -0800543static int __net_init ip_rt_do_proc_init(struct net *net)
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800544{
545 struct proc_dir_entry *pde;
546
547 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
548 &rt_cache_seq_fops);
549 if (!pde)
550 goto err1;
551
Wang Chen77020722008-02-28 14:14:25 -0800552 pde = proc_create("rt_cache", S_IRUGO,
553 net->proc_net_stat, &rt_cpu_seq_fops);
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800554 if (!pde)
555 goto err2;
556
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800557#ifdef CONFIG_NET_CLS_ROUTE
558 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
559 ip_rt_acct_read, NULL);
560 if (!pde)
561 goto err3;
562#endif
563 return 0;
564
565#ifdef CONFIG_NET_CLS_ROUTE
566err3:
567 remove_proc_entry("rt_cache", net->proc_net_stat);
568#endif
569err2:
570 remove_proc_entry("rt_cache", net->proc_net);
571err1:
572 return -ENOMEM;
573}
Denis V. Lunev73b38712008-02-28 20:51:18 -0800574
575static void __net_exit ip_rt_do_proc_exit(struct net *net)
576{
577 remove_proc_entry("rt_cache", net->proc_net_stat);
578 remove_proc_entry("rt_cache", net->proc_net);
579 remove_proc_entry("rt_acct", net->proc_net);
580}
581
582static struct pernet_operations ip_rt_proc_ops __net_initdata = {
583 .init = ip_rt_do_proc_init,
584 .exit = ip_rt_do_proc_exit,
585};
586
587static int __init ip_rt_proc_init(void)
588{
589 return register_pernet_subsys(&ip_rt_proc_ops);
590}
591
Pavel Emelyanov107f1632007-12-05 21:14:28 -0800592#else
/* Without CONFIG_PROC_FS there is nothing to register; always succeeds. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
Linus Torvalds1da177e2005-04-16 15:20:36 -0700597#endif /* CONFIG_PROC_FS */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900598
Stephen Hemminger5969f712008-04-10 01:52:09 -0700599static inline void rt_free(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700600{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700601 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
602}
603
Stephen Hemminger5969f712008-04-10 01:52:09 -0700604static inline void rt_drop(struct rtable *rt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700606 ip_rt_put(rt);
607 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
608}
609
Stephen Hemminger5969f712008-04-10 01:52:09 -0700610static inline int rt_fast_clean(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700611{
612 /* Kill broadcast/multicast entries very aggresively, if they
613 collide in hash table with more useful entries */
614 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800615 rth->fl.iif && rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700616}
617
Stephen Hemminger5969f712008-04-10 01:52:09 -0700618static inline int rt_valuable(struct rtable *rth)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700619{
620 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
621 rth->u.dst.expires;
622}
623
624static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
625{
626 unsigned long age;
627 int ret = 0;
628
629 if (atomic_read(&rth->u.dst.__refcnt))
630 goto out;
631
632 ret = 1;
633 if (rth->u.dst.expires &&
634 time_after_eq(jiffies, rth->u.dst.expires))
635 goto out;
636
637 age = jiffies - rth->u.dst.lastuse;
638 ret = 0;
639 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
640 (age <= tmo2 && rt_valuable(rth)))
641 goto out;
642 ret = 1;
643out: return ret;
644}
645
646/* Bits of score are:
647 * 31: very valuable
648 * 30: not quite useless
649 * 29..0: usage counter
650 */
651static inline u32 rt_score(struct rtable *rt)
652{
653 u32 score = jiffies - rt->u.dst.lastuse;
654
655 score = ~score & ~(3<<30);
656
657 if (rt_valuable(rt))
658 score |= (1<<31);
659
660 if (!rt->fl.iif ||
661 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
662 score |= (1<<30);
663
664 return score;
665}
666
667static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
668{
Al Viro714e85b2006-11-14 20:51:49 -0800669 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
670 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
Thomas Graf47dcf0c2006-11-09 15:20:38 -0800671 (fl1->mark ^ fl2->mark) |
David S. Miller8238b212006-10-12 00:49:15 -0700672 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
673 *(u16 *)&fl2->nl_u.ip4_u.tos) |
674 (fl1->oif ^ fl2->oif) |
675 (fl1->iif ^ fl2->iif)) == 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700676}
677
Denis V. Lunevb5921912008-01-22 23:50:25 -0800678static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
679{
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +0900680 return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
Denis V. Lunevb5921912008-01-22 23:50:25 -0800681}
682
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800683/*
684 * Perform a full scan of hash table and free all entries.
685 * Can be called by a softirq or a process.
686 * In the later case, we want to be reschedule if necessary
687 */
688static void rt_do_flush(int process_context)
689{
690 unsigned int i;
691 struct rtable *rth, *next;
692
693 for (i = 0; i <= rt_hash_mask; i++) {
694 if (process_context && need_resched())
695 cond_resched();
696 rth = rt_hash_table[i].chain;
697 if (!rth)
698 continue;
699
700 spin_lock_bh(rt_hash_lock_addr(i));
701 rth = rt_hash_table[i].chain;
702 rt_hash_table[i].chain = NULL;
703 spin_unlock_bh(rt_hash_lock_addr(i));
704
705 for (; rth; rth = next) {
706 next = rth->u.dst.rt_next;
707 rt_free(rth);
708 }
709 }
710}
711
712static void rt_check_expire(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700713{
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700714 static unsigned int rover;
715 unsigned int i = rover, goal;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700716 struct rtable *rth, **rthp;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700717 u64 mult;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700718
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700719 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
720 if (ip_rt_gc_timeout > 1)
721 do_div(mult, ip_rt_gc_timeout);
722 goal = (unsigned int)mult;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700723 if (goal > rt_hash_mask)
724 goal = rt_hash_mask + 1;
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700725 for (; goal > 0; goal--) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700726 unsigned long tmo = ip_rt_gc_timeout;
727
728 i = (i + 1) & rt_hash_mask;
729 rthp = &rt_hash_table[i].chain;
730
Eric Dumazetd90bf5a2007-11-14 16:14:05 -0800731 if (need_resched())
732 cond_resched();
733
Stephen Hemmingercfcabdc2007-10-09 01:59:42 -0700734 if (*rthp == NULL)
Eric Dumazetbb1d23b2005-07-05 15:00:32 -0700735 continue;
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700736 spin_lock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700737 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800738 if (rth->rt_genid != atomic_read(&rt_genid)) {
739 *rthp = rth->u.dst.rt_next;
740 rt_free(rth);
741 continue;
742 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700743 if (rth->u.dst.expires) {
744 /* Entry is expired even if it is in use */
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700745 if (time_before_eq(jiffies, rth->u.dst.expires)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700746 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800747 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700748 continue;
749 }
750 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
751 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800752 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700753 continue;
754 }
755
756 /* Cleanup aged off entries. */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800757 *rthp = rth->u.dst.rt_next;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900758 rt_free(rth);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700759 }
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700760 spin_unlock_bh(rt_hash_lock_addr(i));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700761 }
762 rover = i;
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800763}
764
765/*
766 * rt_worker_func() is run in process context.
Eric Dumazet29e75252008-01-31 17:05:09 -0800767 * we call rt_check_expire() to scan part of the hash table
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800768 */
769static void rt_worker_func(struct work_struct *work)
770{
Eric Dumazet29e75252008-01-31 17:05:09 -0800771 rt_check_expire();
Eric Dumazet39c90ec2007-09-15 10:55:54 -0700772 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700773}
774
Eric Dumazet29e75252008-01-31 17:05:09 -0800775/*
776 * Pertubation of rt_genid by a small quantity [1..256]
777 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
778 * many times (2^24) without giving recent rt_genid.
779 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700780 */
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700781static void rt_cache_invalidate(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700782{
Eric Dumazet29e75252008-01-31 17:05:09 -0800783 unsigned char shuffle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700784
Eric Dumazet29e75252008-01-31 17:05:09 -0800785 get_random_bytes(&shuffle, sizeof(shuffle));
786 atomic_add(shuffle + 1U, &rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700787}
788
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 *
 * The flush is told it runs in process context unless in_softirq() is true.
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay < 0)
		return;
	rt_do_flush(!in_softirq());
}
799
800/*
801 * We change rt_genid and let gc do the cleanup
Eric Dumazetbeb659b2007-11-19 22:43:37 -0800802 */
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700803static void rt_secret_rebuild(unsigned long __net)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700804{
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700805 struct net *net = (struct net *)__net;
Denis V. Lunev86c657f2008-07-05 19:03:31 -0700806 rt_cache_invalidate(net);
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -0700807 mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700808}
809
/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
822
/*
 * rt_garbage_collect - try to shrink the route cache.
 *
 * Computes a "goal" (number of entries to expire) from the current entry
 * count, ip_rt_gc_elasticity, rt_hash_log and the running "equilibrium"
 * estimate, then walks the hash buckets starting after "rover", freeing
 * entries that carry an old rt_genid or that rt_may_expire() accepts.
 * If the goal is missed, "expire" is halved and the scan may be repeated
 * (only outside softirq and while jiffies has not advanced past "now").
 *
 * The ops argument is not used by the body; all accounting goes through
 * ipv4_dst_ops.
 *
 * Returns 0 normally, or 1 ("dst cache overflow") when the entry count is
 * still >= ip_rt_max_size after the scan gave up.
 *
 * NOTE(review): the RT_CACHE_DEBUG printk()s pass the unsigned long
 * "expire" to a %u conversion - confirm and switch to %lu if so.
 */
Daniel Lezcano569d3642008-01-18 03:56:57 -0800823static int rt_garbage_collect(struct dst_ops *ops)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700824{
825 static unsigned long expire = RT_GC_TIMEOUT;
826 static unsigned long last_gc;
827 static int rover;
828 static int equilibrium;
829 struct rtable *rth, **rthp;
830 unsigned long now = jiffies;
831 int goal;
832
833 /*
834 * Garbage collection is pretty expensive,
835 * do not make it too frequently.
836 */
837
838 RT_CACHE_STAT_INC(gc_total);
839
840 if (now - last_gc < ip_rt_gc_min_interval &&
841 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842 RT_CACHE_STAT_INC(gc_ignored);
843 goto out;
844 }
845
846 /* Calculate number of entries, which we want to expire now. */
847 goal = atomic_read(&ipv4_dst_ops.entries) -
848 (ip_rt_gc_elasticity << rt_hash_log);
849 if (goal <= 0) {
850 if (equilibrium < ipv4_dst_ops.gc_thresh)
851 equilibrium = ipv4_dst_ops.gc_thresh;
852 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
853 if (goal > 0) {
Eric Dumazetb790ced2007-12-21 01:49:07 -0800854 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700855 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856 }
857 } else {
858 /* We are in dangerous area. Try to reduce cache really
859 * aggressively.
860 */
Eric Dumazetb790ced2007-12-21 01:49:07 -0800861 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700862 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
863 }
864
865 if (now - last_gc >= ip_rt_gc_min_interval)
866 last_gc = now;
867
868 if (goal <= 0) {
869 equilibrium += goal;
870 goto work_done;
871 }
872
873 do {
874 int i, k;
875
876 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877 unsigned long tmo = expire;
878
879 k = (k + 1) & rt_hash_mask;
880 rthp = &rt_hash_table[k].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -0700881 spin_lock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700882 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800883 if (rth->rt_genid == atomic_read(&rt_genid) &&
884 !rt_may_expire(rth, tmo, expire)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700885 tmo >>= 1;
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800886 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700887 continue;
888 }
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800889 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700890 rt_free(rth);
891 goal--;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700892 }
Eric Dumazet22c047c2005-07-05 14:55:24 -0700893 spin_unlock_bh(rt_hash_lock_addr(k));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700894 if (goal <= 0)
895 break;
896 }
897 rover = k;
898
899 if (goal <= 0)
900 goto work_done;
901
902 /* Goal is not achieved. We stop process if:
903
904 - if expire reduced to zero. Otherwise, expire is halfed.
905 - if table is not full.
906 - if we are called from interrupt.
907 - jiffies check is just fallback/debug loop breaker.
908 We will not spin here for long time in any case.
909 */
910
911 RT_CACHE_STAT_INC(gc_goal_miss);
912
913 if (expire == 0)
914 break;
915
916 expire >>= 1;
917#if RT_CACHE_DEBUG >= 2
918 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
919 atomic_read(&ipv4_dst_ops.entries), goal, i);
920#endif
921
922 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
923 goto out;
924 } while (!in_softirq() && time_before_eq(jiffies, now));
925
926 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
927 goto out;
928 if (net_ratelimit())
929 printk(KERN_WARNING "dst cache overflow\n");
930 RT_CACHE_STAT_INC(gc_dst_overflow);
931 return 1;
932
933work_done:
934 expire += ip_rt_gc_min_interval;
935 if (expire > ip_rt_gc_timeout ||
936 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
937 expire = ip_rt_gc_timeout;
938#if RT_CACHE_DEBUG >= 2
939 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
940 atomic_read(&ipv4_dst_ops.entries), goal, rover);
941#endif
942out: return 0;
943}
944
/*
 * rt_intern_hash - insert route "rt" into hash bucket "hash".
 *
 * Under the bucket lock the chain is walked once: entries with a stale
 * rt_genid are freed; if an entry with matching keys and netns is found it
 * is moved to the head of the chain, "rt" is dropped and *rp is set to the
 * existing entry.  Otherwise the unreferenced entry with the lowest
 * rt_score() is remembered and freed when the chain is longer than
 * ip_rt_gc_elasticity, and "rt" is linked in at the head of the chain.
 *
 * For unicast routes and output routes (fl.iif == 0) a neighbour is bound
 * first via arp_bind_neighbour().  On -ENOBUFS, and only when not called
 * from softirq, one forced rt_garbage_collect() pass is run (with
 * ip_rt_gc_elasticity and ip_rt_gc_min_interval temporarily overridden)
 * and the whole insertion is restarted.
 *
 * Returns 0 with *rp set on success; otherwise "rt" is dropped and the
 * arp_bind_neighbour() error (or -ENOBUFS) is returned.
 */
945static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
946{
947 struct rtable *rth, **rthp;
948 unsigned long now;
949 struct rtable *cand, **candp;
950 u32 min_score;
951 int chain_length;
952 int attempts = !in_softirq();
953
954restart:
955 chain_length = 0;
956 min_score = ~(u32)0;
957 cand = NULL;
958 candp = NULL;
959 now = jiffies;
960
961 rthp = &rt_hash_table[hash].chain;
962
Eric Dumazet22c047c2005-07-05 14:55:24 -0700963 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700964 while ((rth = *rthp) != NULL) {
Eric Dumazet29e75252008-01-31 17:05:09 -0800965 if (rth->rt_genid != atomic_read(&rt_genid)) {
966 *rthp = rth->u.dst.rt_next;
967 rt_free(rth);
968 continue;
969 }
Denis V. Lunevb5921912008-01-22 23:50:25 -0800970 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700971 /* Put it first */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800972 *rthp = rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700973 /*
974 * Since lookup is lockfree, the deletion
975 * must be visible to another weakly ordered CPU before
976 * the insertion at the start of the hash chain.
977 */
Eric Dumazet093c2ca2007-02-09 16:19:26 -0800978 rcu_assign_pointer(rth->u.dst.rt_next,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700979 rt_hash_table[hash].chain);
980 /*
981 * Since lookup is lockfree, the update writes
982 * must be ordered for consistency on SMP.
983 */
984 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
985
Pavel Emelyanov03f49f32007-11-10 21:28:34 -0800986 dst_use(&rth->u.dst, now);
Eric Dumazet22c047c2005-07-05 14:55:24 -0700987 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700988
989 rt_drop(rt);
990 *rp = rth;
991 return 0;
992 }
993
994 if (!atomic_read(&rth->u.dst.__refcnt)) {
995 u32 score = rt_score(rth);
996
997 if (score <= min_score) {
998 cand = rth;
999 candp = rthp;
1000 min_score = score;
1001 }
1002 }
1003
1004 chain_length++;
1005
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001006 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001007 }
1008
1009 if (cand) {
1010 /* ip_rt_gc_elasticity used to be average length of chain
1011 * length, when exceeded gc becomes really aggressive.
1012 *
1013 * The second limit is less certain. At the moment it allows
1014 * only 2 entries per bucket. We will see.
1015 */
1016 if (chain_length > ip_rt_gc_elasticity) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001017 *candp = cand->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001018 rt_free(cand);
1019 }
1020 }
1021
1022 /* Try to bind route to arp only if it is output
1023 route or unicast forwarding path.
1024 */
1025 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1026 int err = arp_bind_neighbour(&rt->u.dst);
1027 if (err) {
Eric Dumazet22c047c2005-07-05 14:55:24 -07001028 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001029
1030 if (err != -ENOBUFS) {
1031 rt_drop(rt);
1032 return err;
1033 }
1034
1035 /* Neighbour tables are full and nothing
1036 can be released. Try to shrink route cache,
1037 it is most likely it holds some neighbour records.
1038 */
1039 if (attempts-- > 0) {
1040 int saved_elasticity = ip_rt_gc_elasticity;
1041 int saved_int = ip_rt_gc_min_interval;
1042 ip_rt_gc_elasticity = 1;
1043 ip_rt_gc_min_interval = 0;
Daniel Lezcano569d3642008-01-18 03:56:57 -08001044 rt_garbage_collect(&ipv4_dst_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001045 ip_rt_gc_min_interval = saved_int;
1046 ip_rt_gc_elasticity = saved_elasticity;
1047 goto restart;
1048 }
1049
1050 if (net_ratelimit())
1051 printk(KERN_WARNING "Neighbour table overflow.\n");
1052 rt_drop(rt);
1053 return -ENOBUFS;
1054 }
1055 }
1056
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001057 rt->u.dst.rt_next = rt_hash_table[hash].chain;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001058#if RT_CACHE_DEBUG >= 2
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001059 if (rt->u.dst.rt_next) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001060 struct rtable *trt;
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001061 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001062 NIPQUAD(rt->rt_dst));
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001063 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001064 printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001065 printk("\n");
1066 }
1067#endif
1068 rt_hash_table[hash].chain = rt;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001069 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001070 *rp = rt;
1071 return 0;
1072}
1073
1074void rt_bind_peer(struct rtable *rt, int create)
1075{
1076 static DEFINE_SPINLOCK(rt_peer_lock);
1077 struct inet_peer *peer;
1078
1079 peer = inet_getpeer(rt->rt_dst, create);
1080
1081 spin_lock_bh(&rt_peer_lock);
1082 if (rt->peer == NULL) {
1083 rt->peer = peer;
1084 peer = NULL;
1085 }
1086 spin_unlock_bh(&rt_peer_lock);
1087 if (peer)
1088 inet_putpeer(peer);
1089}
1090
1091/*
1092 * Peer allocation may fail only in serious out-of-memory conditions. However
1093 * we still can generate some output.
1094 * Random ID selection looks a bit dangerous because we have no chances to
1095 * select ID being unique in a reasonable period of time.
1096 * But broken packet identifier may be better than no packet at all.
1097 */
1098static void ip_select_fb_ident(struct iphdr *iph)
1099{
1100 static DEFINE_SPINLOCK(ip_fb_id_lock);
1101 static u32 ip_fallback_id;
1102 u32 salt;
1103
1104 spin_lock_bh(&ip_fb_id_lock);
Al Viroe4485152006-09-26 22:15:01 -07001105 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001106 iph->id = htons(salt & 0xFFFF);
1107 ip_fallback_id = salt;
1108 spin_unlock_bh(&ip_fb_id_lock);
1109}
1110
1111void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1112{
1113 struct rtable *rt = (struct rtable *) dst;
1114
1115 if (rt) {
1116 if (rt->peer == NULL)
1117 rt_bind_peer(rt, 1);
1118
1119 /* If peer is attached to destination, it is never detached,
1120 so that we need not to grab a lock to dereference it.
1121 */
1122 if (rt->peer) {
1123 iph->id = htons(inet_getid(rt->peer, more));
1124 return;
1125 }
1126 } else
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001127 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
Stephen Hemminger9c2b3322005-04-19 22:39:42 -07001128 __builtin_return_address(0));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001129
1130 ip_select_fb_ident(iph);
1131}
1132
1133static void rt_del(unsigned hash, struct rtable *rt)
1134{
Eric Dumazet29e75252008-01-31 17:05:09 -08001135 struct rtable **rthp, *aux;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001136
Eric Dumazet29e75252008-01-31 17:05:09 -08001137 rthp = &rt_hash_table[hash].chain;
Eric Dumazet22c047c2005-07-05 14:55:24 -07001138 spin_lock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001139 ip_rt_put(rt);
Eric Dumazet29e75252008-01-31 17:05:09 -08001140 while ((aux = *rthp) != NULL) {
1141 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1142 *rthp = aux->u.dst.rt_next;
1143 rt_free(aux);
1144 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001145 }
Eric Dumazet29e75252008-01-31 17:05:09 -08001146 rthp = &aux->u.dst.rt_next;
1147 }
Eric Dumazet22c047c2005-07-05 14:55:24 -07001148 spin_unlock_bh(rt_hash_lock_addr(hash));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001149}
1150
/*
 * ip_rt_redirect - process a redirect from old_gw advising new_gw as the
 * gateway for daddr.
 *
 * The redirect is rejected (and logged under CONFIG_IP_ROUTE_VERBOSE when
 * martian logging is enabled) if new_gw equals old_gw, redirects are not
 * accepted on the device, new_gw is multicast/limited-broadcast/zeronet,
 * or the on-link / default-gateway / unicast checks below fail.
 *
 * Otherwise every combination of source key {saddr, 0} and output
 * interface key {dev->ifindex, 0} is looked up in the route cache.  A
 * matching output route through old_gw is cloned, the clone's gateway is
 * set to new_gw and it is flagged RTCF_REDIRECTED; if a neighbour with a
 * valid NUD state can be bound, netevent notifiers are called, the old
 * entry is removed with rt_del() and the clone is inserted with
 * rt_intern_hash().
 *
 * NOTE(review): dev->ifindex is read in the initializers before the
 * in_dev NULL check, so dev itself is assumed to be non-NULL - confirm
 * against callers.
 */
Al Virof7655222006-09-26 21:25:43 -07001151void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1152 __be32 saddr, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001153{
1154 int i, k;
1155 struct in_device *in_dev = in_dev_get(dev);
1156 struct rtable *rth, **rthp;
Al Virof7655222006-09-26 21:25:43 -07001157 __be32 skeys[2] = { saddr, 0 };
Linus Torvalds1da177e2005-04-16 15:20:36 -07001158 int ikeys[2] = { dev->ifindex, 0 };
Tom Tucker8d717402006-07-30 20:43:36 -07001159 struct netevent_redirect netevent;
Denis V. Lunev317805b2008-02-28 20:50:06 -08001160 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001161
Linus Torvalds1da177e2005-04-16 15:20:36 -07001162 if (!in_dev)
1163 return;
1164
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001165 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001166 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001167 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
Joe Perchesf97c1e02007-12-16 13:45:43 -08001168 || ipv4_is_zeronet(new_gw))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001169 goto reject_redirect;
1170
1171 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1172 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1173 goto reject_redirect;
1174 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1175 goto reject_redirect;
1176 } else {
Denis V. Lunev317805b2008-02-28 20:50:06 -08001177 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001178 goto reject_redirect;
1179 }
1180
1181 for (i = 0; i < 2; i++) {
1182 for (k = 0; k < 2; k++) {
Al Viro8c7bc842006-09-26 21:26:19 -07001183 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001184
1185 rthp=&rt_hash_table[hash].chain;
1186
1187 rcu_read_lock();
1188 while ((rth = rcu_dereference(*rthp)) != NULL) {
1189 struct rtable *rt;
1190
1191 if (rth->fl.fl4_dst != daddr ||
1192 rth->fl.fl4_src != skeys[i] ||
Linus Torvalds1da177e2005-04-16 15:20:36 -07001193 rth->fl.oif != ikeys[k] ||
Eric Dumazet29e75252008-01-31 17:05:09 -08001194 rth->fl.iif != 0 ||
Denis V. Lunev317805b2008-02-28 20:50:06 -08001195 rth->rt_genid != atomic_read(&rt_genid) ||
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09001196 !net_eq(dev_net(rth->u.dst.dev), net)) {
Eric Dumazet093c2ca2007-02-09 16:19:26 -08001197 rthp = &rth->u.dst.rt_next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001198 continue;
1199 }
1200
1201 if (rth->rt_dst != daddr ||
1202 rth->rt_src != saddr ||
1203 rth->u.dst.error ||
1204 rth->rt_gateway != old_gw ||
1205 rth->u.dst.dev != dev)
1206 break;
1207
1208 dst_hold(&rth->u.dst);
1209 rcu_read_unlock();
1210
1211 rt = dst_alloc(&ipv4_dst_ops);
1212 if (rt == NULL) {
1213 ip_rt_put(rth);
1214 in_dev_put(in_dev);
1215 return;
1216 }
1217
1218 /* Copy all the information. */
1219 *rt = *rth;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001220 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001221 rt->u.dst.__use = 1;
1222 atomic_set(&rt->u.dst.__refcnt, 1);
1223 rt->u.dst.child = NULL;
1224 if (rt->u.dst.dev)
1225 dev_hold(rt->u.dst.dev);
1226 if (rt->idev)
1227 in_dev_hold(rt->idev);
1228 rt->u.dst.obsolete = 0;
1229 rt->u.dst.lastuse = jiffies;
1230 rt->u.dst.path = &rt->u.dst;
1231 rt->u.dst.neighbour = NULL;
1232 rt->u.dst.hh = NULL;
1233 rt->u.dst.xfrm = NULL;
Eric Dumazet29e75252008-01-31 17:05:09 -08001234 rt->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001235 rt->rt_flags |= RTCF_REDIRECTED;
1236
1237 /* Gateway is different ... */
1238 rt->rt_gateway = new_gw;
1239
1240 /* Redirect received -> path was valid */
1241 dst_confirm(&rth->u.dst);
1242
1243 if (rt->peer)
1244 atomic_inc(&rt->peer->refcnt);
1245
1246 if (arp_bind_neighbour(&rt->u.dst) ||
1247 !(rt->u.dst.neighbour->nud_state &
1248 NUD_VALID)) {
1249 if (rt->u.dst.neighbour)
1250 neigh_event_send(rt->u.dst.neighbour, NULL);
1251 ip_rt_put(rth);
1252 rt_drop(rt);
1253 goto do_next;
1254 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001255
Tom Tucker8d717402006-07-30 20:43:36 -07001256 netevent.old = &rth->u.dst;
1257 netevent.new = &rt->u.dst;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001258 call_netevent_notifiers(NETEVENT_REDIRECT,
1259 &netevent);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001260
1261 rt_del(hash, rth);
1262 if (!rt_intern_hash(hash, rt, &rt))
1263 ip_rt_put(rt);
1264 goto do_next;
1265 }
1266 rcu_read_unlock();
1267 do_next:
1268 ;
1269 }
1270 }
1271 in_dev_put(in_dev);
1272 return;
1273
1274reject_redirect:
1275#ifdef CONFIG_IP_ROUTE_VERBOSE
1276 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001277 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1278 NIPQUAD_FMT " ignored.\n"
1279 " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001280 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
Ilia Sotnikovcef26852006-03-25 01:38:55 -08001281 NIPQUAD(saddr), NIPQUAD(daddr));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001282#endif
1283 in_dev_put(in_dev);
1284}
1285
1286static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1287{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001288 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001289 struct dst_entry *ret = dst;
1290
1291 if (rt) {
1292 if (dst->obsolete) {
1293 ip_rt_put(rt);
1294 ret = NULL;
1295 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1296 rt->u.dst.expires) {
Al Viro8c7bc842006-09-26 21:26:19 -07001297 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1298 rt->fl.oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001299#if RT_CACHE_DEBUG >= 1
Denis V. Lunev56c99d02007-12-06 02:19:07 -08001300 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001301 NIPQUAD_FMT "/%02x dropped\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001302 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1303#endif
1304 rt_del(hash, rt);
1305 ret = NULL;
1306 }
1307 }
1308 return ret;
1309}
1310
1311/*
1312 * Algorithm:
1313 * 1. The first ip_rt_redirect_number redirects are sent
1314 * with exponential backoff, then we stop sending them at all,
1315 * assuming that the host ignores our redirects.
1316 * 2. If we did not see packets requiring redirects
1317 * during ip_rt_redirect_silence, we assume that the host
1318 * forgot redirected route and start to send redirects again.
1319 *
1320 * This algorithm is much cheaper and more intelligent than dumb load limiting
1321 * in icmp.c.
1322 *
1323 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1324 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1325 */
1326
1327void ip_rt_send_redirect(struct sk_buff *skb)
1328{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001329 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001330 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1331
1332 if (!in_dev)
1333 return;
1334
1335 if (!IN_DEV_TX_REDIRECTS(in_dev))
1336 goto out;
1337
1338 /* No redirected packets during ip_rt_redirect_silence;
1339 * reset the algorithm.
1340 */
1341 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1342 rt->u.dst.rate_tokens = 0;
1343
1344 /* Too many ignored redirects; do not send anything
1345 * set u.dst.rate_last to the last seen redirected packet.
1346 */
1347 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1348 rt->u.dst.rate_last = jiffies;
1349 goto out;
1350 }
1351
1352 /* Check for load limit; set rate_last to the latest sent
1353 * redirect.
1354 */
Li Yewang14fb8a72006-12-18 00:26:35 -08001355 if (rt->u.dst.rate_tokens == 0 ||
1356 time_after(jiffies,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001357 (rt->u.dst.rate_last +
1358 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1359 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1360 rt->u.dst.rate_last = jiffies;
1361 ++rt->u.dst.rate_tokens;
1362#ifdef CONFIG_IP_ROUTE_VERBOSE
1363 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1364 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1365 net_ratelimit())
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001366 printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1367 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368 NIPQUAD(rt->rt_src), rt->rt_iif,
1369 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1370#endif
1371 }
1372out:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001373 in_dev_put(in_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374}
1375
1376static int ip_error(struct sk_buff *skb)
1377{
Eric Dumazetee6b9672008-03-05 18:30:47 -08001378 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001379 unsigned long now;
1380 int code;
1381
1382 switch (rt->u.dst.error) {
1383 case EINVAL:
1384 default:
1385 goto out;
1386 case EHOSTUNREACH:
1387 code = ICMP_HOST_UNREACH;
1388 break;
1389 case ENETUNREACH:
1390 code = ICMP_NET_UNREACH;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08001391 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001392 break;
1393 case EACCES:
1394 code = ICMP_PKT_FILTERED;
1395 break;
1396 }
1397
1398 now = jiffies;
1399 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1400 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1401 rt->u.dst.rate_tokens = ip_rt_error_burst;
1402 rt->u.dst.rate_last = now;
1403 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1404 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1405 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1406 }
1407
1408out: kfree_skb(skb);
1409 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001410}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001411
/*
 * MTU plateau table, ordered from largest to smallest.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau value that is strictly smaller than old_mtu,
 * or 68 when old_mtu is not above the smallest plateau.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}

	return 68;
}
1429
/*
 * ip_rt_frag_needed - lower the cached path MTU for the flow described by
 * iph, using the advertised new_mtu.
 *
 * Does nothing and returns 0 when ipv4_config.no_pmtu_disc is set.
 * Otherwise the cached output routes in "net" for iph->daddr/iph->saddr
 * are visited (trying oif keys {dev->ifindex, 0} and source keys
 * {iph->saddr, 0}), skipping routes whose MTU metric is locked or whose
 * rt_genid is stale.  When new_mtu is below 68 or not smaller than the
 * packet's total length, guess_mtu() picks a plateau value instead.  An
 * MTU lower than the route's current one is stored (raised to
 * ip_rt_min_pmtu if smaller, in which case the metric is also locked) and
 * the route gets an expiry of ip_rt_mtu_expires.
 *
 * Returns the last non-zero MTU estimate recorded for a matching route,
 * or new_mtu if there was none.
 *
 * NOTE(review): old_mtu is adjusted in place by the BSD 4.2 hack inside
 * the route loop, so the adjustment can apply again on a later matching
 * route - confirm this is intended.
 */
Denis V. Lunevb5921912008-01-22 23:50:25 -08001430unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
Timo Teras0010e462008-04-29 03:32:25 -07001431 unsigned short new_mtu,
1432 struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001433{
Timo Teras0010e462008-04-29 03:32:25 -07001434 int i, k;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001435 unsigned short old_mtu = ntohs(iph->tot_len);
1436 struct rtable *rth;
Timo Teras0010e462008-04-29 03:32:25 -07001437 int ikeys[2] = { dev->ifindex, 0 };
Al Viroe4485152006-09-26 22:15:01 -07001438 __be32 skeys[2] = { iph->saddr, 0, };
1439 __be32 daddr = iph->daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001440 unsigned short est_mtu = 0;
1441
1442 if (ipv4_config.no_pmtu_disc)
1443 return 0;
1444
Timo Teras0010e462008-04-29 03:32:25 -07001445 for (k = 0; k < 2; k++) {
1446 for (i = 0; i < 2; i++) {
1447 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001448
Timo Teras0010e462008-04-29 03:32:25 -07001449 rcu_read_lock();
1450 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1451 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452 unsigned short mtu = new_mtu;
1453
Timo Teras0010e462008-04-29 03:32:25 -07001454 if (rth->fl.fl4_dst != daddr ||
1455 rth->fl.fl4_src != skeys[i] ||
1456 rth->rt_dst != daddr ||
1457 rth->rt_src != iph->saddr ||
1458 rth->fl.oif != ikeys[k] ||
1459 rth->fl.iif != 0 ||
1460 dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1461 !net_eq(dev_net(rth->u.dst.dev), net) ||
1462 rth->rt_genid != atomic_read(&rt_genid))
1463 continue;
1464
Linus Torvalds1da177e2005-04-16 15:20:36 -07001465 if (new_mtu < 68 || new_mtu >= old_mtu) {
1466
1467 /* BSD 4.2 compatibility hack :-( */
1468 if (mtu == 0 &&
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001469 old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470 old_mtu >= 68 + (iph->ihl << 2))
1471 old_mtu -= iph->ihl << 2;
1472
1473 mtu = guess_mtu(old_mtu);
1474 }
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001475 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1476 if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477 dst_confirm(&rth->u.dst);
1478 if (mtu < ip_rt_min_pmtu) {
1479 mtu = ip_rt_min_pmtu;
1480 rth->u.dst.metrics[RTAX_LOCK-1] |=
1481 (1 << RTAX_MTU);
1482 }
1483 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1484 dst_set_expires(&rth->u.dst,
1485 ip_rt_mtu_expires);
1486 }
1487 est_mtu = mtu;
1488 }
1489 }
Timo Teras0010e462008-04-29 03:32:25 -07001490 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001491 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001492 }
1493 return est_mtu ? : new_mtu;
1494}
1495
1496static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1497{
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001498 if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001499 !(dst_metric_locked(dst, RTAX_MTU))) {
1500 if (mtu < ip_rt_min_pmtu) {
1501 mtu = ip_rt_min_pmtu;
1502 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1503 }
1504 dst->metrics[RTAX_MTU-1] = mtu;
1505 dst_set_expires(dst, ip_rt_mtu_expires);
Tom Tucker8d717402006-07-30 20:43:36 -07001506 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507 }
1508}
1509
1510static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1511{
1512 return NULL;
1513}
1514
1515static void ipv4_dst_destroy(struct dst_entry *dst)
1516{
1517 struct rtable *rt = (struct rtable *) dst;
1518 struct inet_peer *peer = rt->peer;
1519 struct in_device *idev = rt->idev;
1520
1521 if (peer) {
1522 rt->peer = NULL;
1523 inet_putpeer(peer);
1524 }
1525
1526 if (idev) {
1527 rt->idev = NULL;
1528 in_dev_put(idev);
1529 }
1530}
1531
1532static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1533 int how)
1534{
1535 struct rtable *rt = (struct rtable *) dst;
1536 struct in_device *idev = rt->idev;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001537 if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
Denis V. Lunev5a3e55d2007-12-07 00:38:10 -08001538 struct in_device *loopback_idev =
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001539 in_dev_get(dev_net(dev)->loopback_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540 if (loopback_idev) {
1541 rt->idev = loopback_idev;
1542 in_dev_put(idev);
1543 }
1544 }
1545}
1546
1547static void ipv4_link_failure(struct sk_buff *skb)
1548{
1549 struct rtable *rt;
1550
1551 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1552
Eric Dumazetee6b9672008-03-05 18:30:47 -08001553 rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554 if (rt)
1555 dst_set_expires(&rt->u.dst, 0);
1556}
1557
1558static int ip_rt_bug(struct sk_buff *skb)
1559{
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001560 printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001561 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001562 skb->dev ? skb->dev->name : "?");
1563 kfree_skb(skb);
1564 return 0;
1565}
1566
1567/*
1568 We do not cache source address of outgoing interface,
1569 because it is used only by IP RR, TS and SRR options,
1570 so that it out of fast path.
1571
1572 BTW remember: "addr" is allowed to be not aligned
1573 in IP options!
1574 */
1575
1576void ip_rt_get_source(u8 *addr, struct rtable *rt)
1577{
Al Viroa61ced52006-09-26 21:27:54 -07001578 __be32 src;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001579 struct fib_result res;
1580
1581 if (rt->fl.iif == 0)
1582 src = rt->rt_src;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001583 else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001584 src = FIB_RES_PREFSRC(res);
1585 fib_res_put(&res);
1586 } else
1587 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1588 RT_SCOPE_UNIVERSE);
1589 memcpy(addr, &src, 4);
1590}
1591
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Fill in the halves of the route's tclassid from "tag".
 * Each 16-bit half is only set when it is currently zero, so an already
 * assigned half is never overwritten.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->u.dst.tclassid & low_half) == 0)
		rt->u.dst.tclassid |= tag & low_half;
	if ((rt->u.dst.tclassid & high_half) == 0)
		rt->u.dst.tclassid |= tag & high_half;
}
#endif
1601
1602static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1603{
1604 struct fib_info *fi = res->fi;
1605
1606 if (fi) {
1607 if (FIB_RES_GW(*res) &&
1608 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1609 rt->rt_gateway = FIB_RES_GW(*res);
1610 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1611 sizeof(rt->u.dst.metrics));
1612 if (fi->fib_mtu == 0) {
1613 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
Satoru SATOH0bbeafd2008-05-04 22:12:43 -07001614 if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001615 rt->rt_gateway != rt->rt_dst &&
1616 rt->u.dst.dev->mtu > 576)
1617 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1618 }
1619#ifdef CONFIG_NET_CLS_ROUTE
1620 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1621#endif
1622 } else
1623 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1624
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001625 if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001626 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001627 if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001628 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001629 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001630 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1631 ip_rt_min_advmss);
Satoru SATOH5ffc02a2008-05-04 22:14:42 -07001632 if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1634
1635#ifdef CONFIG_NET_CLS_ROUTE
1636#ifdef CONFIG_IP_MULTIPLE_TABLES
1637 set_class_tag(rt, fib_rules_tclass(res));
1638#endif
1639 set_class_tag(rt, itag);
1640#endif
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001641 rt->rt_type = res->type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642}
1643
Al Viro9e12bb22006-09-26 21:25:20 -07001644static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001645 u8 tos, struct net_device *dev, int our)
1646{
1647 unsigned hash;
1648 struct rtable *rth;
Al Viroa61ced52006-09-26 21:27:54 -07001649 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650 struct in_device *in_dev = in_dev_get(dev);
1651 u32 itag = 0;
1652
1653 /* Primary sanity checks. */
1654
1655 if (in_dev == NULL)
1656 return -EINVAL;
1657
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001658 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001659 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001660 goto e_inval;
1661
Joe Perchesf97c1e02007-12-16 13:45:43 -08001662 if (ipv4_is_zeronet(saddr)) {
1663 if (!ipv4_is_local_multicast(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001664 goto e_inval;
1665 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1666 } else if (fib_validate_source(saddr, 0, tos, 0,
1667 dev, &spec_dst, &itag) < 0)
1668 goto e_inval;
1669
1670 rth = dst_alloc(&ipv4_dst_ops);
1671 if (!rth)
1672 goto e_nobufs;
1673
1674 rth->u.dst.output= ip_rt_bug;
1675
1676 atomic_set(&rth->u.dst.__refcnt, 1);
1677 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001678 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001679 rth->u.dst.flags |= DST_NOPOLICY;
1680 rth->fl.fl4_dst = daddr;
1681 rth->rt_dst = daddr;
1682 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001683 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001684 rth->fl.fl4_src = saddr;
1685 rth->rt_src = saddr;
1686#ifdef CONFIG_NET_CLS_ROUTE
1687 rth->u.dst.tclassid = itag;
1688#endif
1689 rth->rt_iif =
1690 rth->fl.iif = dev->ifindex;
Eric W. Biederman2774c7a2007-09-26 22:10:56 -07001691 rth->u.dst.dev = init_net.loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001692 dev_hold(rth->u.dst.dev);
1693 rth->idev = in_dev_get(rth->u.dst.dev);
1694 rth->fl.oif = 0;
1695 rth->rt_gateway = daddr;
1696 rth->rt_spec_dst= spec_dst;
Eric Dumazet29e75252008-01-31 17:05:09 -08001697 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698 rth->rt_flags = RTCF_MULTICAST;
Eric Dumazet29e75252008-01-31 17:05:09 -08001699 rth->rt_type = RTN_MULTICAST;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700 if (our) {
1701 rth->u.dst.input= ip_local_deliver;
1702 rth->rt_flags |= RTCF_LOCAL;
1703 }
1704
1705#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08001706 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001707 rth->u.dst.input = ip_mr_input;
1708#endif
1709 RT_CACHE_STAT_INC(in_slow_mc);
1710
1711 in_dev_put(in_dev);
Al Viro8c7bc842006-09-26 21:26:19 -07001712 hash = rt_hash(daddr, saddr, dev->ifindex);
Eric Dumazetee6b9672008-03-05 18:30:47 -08001713 return rt_intern_hash(hash, rth, &skb->rtable);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714
1715e_nobufs:
1716 in_dev_put(in_dev);
1717 return -ENOBUFS;
1718
1719e_inval:
1720 in_dev_put(in_dev);
1721 return -EINVAL;
1722}
1723
1724
1725static void ip_handle_martian_source(struct net_device *dev,
1726 struct in_device *in_dev,
1727 struct sk_buff *skb,
Al Viro9e12bb22006-09-26 21:25:20 -07001728 __be32 daddr,
1729 __be32 saddr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730{
1731 RT_CACHE_STAT_INC(in_martian_src);
1732#ifdef CONFIG_IP_ROUTE_VERBOSE
1733 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1734 /*
1735 * RFC1812 recommendation, if source is martian,
1736 * the only hint is MAC header.
1737 */
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07001738 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1739 NIPQUAD_FMT", on dev %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07001740 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001741 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742 int i;
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001743 const unsigned char *p = skb_mac_header(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001744 printk(KERN_WARNING "ll header: ");
1745 for (i = 0; i < dev->hard_header_len; i++, p++) {
1746 printk("%02x", *p);
1747 if (i < (dev->hard_header_len - 1))
1748 printk(":");
1749 }
1750 printk("\n");
1751 }
1752 }
1753#endif
1754}
1755
Stephen Hemminger5969f712008-04-10 01:52:09 -07001756static int __mkroute_input(struct sk_buff *skb,
1757 struct fib_result *res,
1758 struct in_device *in_dev,
1759 __be32 daddr, __be32 saddr, u32 tos,
1760 struct rtable **result)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001761{
1762
1763 struct rtable *rth;
1764 int err;
1765 struct in_device *out_dev;
1766 unsigned flags = 0;
Al Virod9c9df82006-09-26 21:28:14 -07001767 __be32 spec_dst;
1768 u32 itag;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001769
1770 /* get a working reference to the output device */
1771 out_dev = in_dev_get(FIB_RES_DEV(*res));
1772 if (out_dev == NULL) {
1773 if (net_ratelimit())
1774 printk(KERN_CRIT "Bug in ip_route_input" \
1775 "_slow(). Please, report\n");
1776 return -EINVAL;
1777 }
1778
1779
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001780 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781 in_dev->dev, &spec_dst, &itag);
1782 if (err < 0) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001783 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001784 saddr);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001785
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786 err = -EINVAL;
1787 goto cleanup;
1788 }
1789
1790 if (err)
1791 flags |= RTCF_DIRECTSRC;
1792
Thomas Graf51b77ca2008-06-03 16:36:01 -07001793 if (out_dev == in_dev && err &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001794 (IN_DEV_SHARED_MEDIA(out_dev) ||
1795 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1796 flags |= RTCF_DOREDIRECT;
1797
1798 if (skb->protocol != htons(ETH_P_IP)) {
1799 /* Not IP (i.e. ARP). Do not create route, if it is
1800 * invalid for proxy arp. DNAT routes are always valid.
1801 */
Rami Rosencb7928a2008-01-09 00:18:24 -08001802 if (out_dev == in_dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001803 err = -EINVAL;
1804 goto cleanup;
1805 }
1806 }
1807
1808
1809 rth = dst_alloc(&ipv4_dst_ops);
1810 if (!rth) {
1811 err = -ENOBUFS;
1812 goto cleanup;
1813 }
1814
Julian Anastasovce723d82005-09-08 13:34:47 -07001815 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07001817 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818 rth->u.dst.flags |= DST_NOPOLICY;
Herbert Xu42f811b2007-06-04 23:34:44 -07001819 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820 rth->u.dst.flags |= DST_NOXFRM;
1821 rth->fl.fl4_dst = daddr;
1822 rth->rt_dst = daddr;
1823 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001824 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825 rth->fl.fl4_src = saddr;
1826 rth->rt_src = saddr;
1827 rth->rt_gateway = daddr;
1828 rth->rt_iif =
1829 rth->fl.iif = in_dev->dev->ifindex;
1830 rth->u.dst.dev = (out_dev)->dev;
1831 dev_hold(rth->u.dst.dev);
1832 rth->idev = in_dev_get(rth->u.dst.dev);
1833 rth->fl.oif = 0;
1834 rth->rt_spec_dst= spec_dst;
1835
1836 rth->u.dst.input = ip_forward;
1837 rth->u.dst.output = ip_output;
Eric Dumazet29e75252008-01-31 17:05:09 -08001838 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001839
1840 rt_set_nexthop(rth, res, itag);
1841
1842 rth->rt_flags = flags;
1843
1844 *result = rth;
1845 err = 0;
1846 cleanup:
1847 /* release the working reference to the output device */
1848 in_dev_put(out_dev);
1849 return err;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001850}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001851
Stephen Hemminger5969f712008-04-10 01:52:09 -07001852static int ip_mkroute_input(struct sk_buff *skb,
1853 struct fib_result *res,
1854 const struct flowi *fl,
1855 struct in_device *in_dev,
1856 __be32 daddr, __be32 saddr, u32 tos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001857{
Chuck Short7abaa272005-06-22 22:10:23 -07001858 struct rtable* rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001859 int err;
1860 unsigned hash;
1861
1862#ifdef CONFIG_IP_ROUTE_MULTIPATH
1863 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1864 fib_select_multipath(fl, res);
1865#endif
1866
1867 /* create a routing cache entry */
1868 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1869 if (err)
1870 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001871
1872 /* put it into the cache */
Al Viro8c7bc842006-09-26 21:26:19 -07001873 hash = rt_hash(daddr, saddr, fl->iif);
Eric Dumazetee6b9672008-03-05 18:30:47 -08001874 return rt_intern_hash(hash, rth, &skb->rtable);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001875}
1876
/*
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped back packet must already have
 *	the correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 */
1886
Al Viro9e12bb22006-09-26 21:25:20 -07001887static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001888 u8 tos, struct net_device *dev)
1889{
1890 struct fib_result res;
1891 struct in_device *in_dev = in_dev_get(dev);
1892 struct flowi fl = { .nl_u = { .ip4_u =
1893 { .daddr = daddr,
1894 .saddr = saddr,
1895 .tos = tos,
1896 .scope = RT_SCOPE_UNIVERSE,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001897 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08001898 .mark = skb->mark,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001899 .iif = dev->ifindex };
1900 unsigned flags = 0;
1901 u32 itag = 0;
1902 struct rtable * rth;
1903 unsigned hash;
Al Viro9e12bb22006-09-26 21:25:20 -07001904 __be32 spec_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001905 int err = -EINVAL;
1906 int free_res = 0;
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09001907 struct net * net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001908
1909 /* IP on this device is disabled. */
1910
1911 if (!in_dev)
1912 goto out;
1913
1914 /* Check for the most weird martians, which can be not detected
1915 by fib_lookup.
1916 */
1917
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001918 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001919 ipv4_is_loopback(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001920 goto martian_source;
1921
Al Viroe4485152006-09-26 22:15:01 -07001922 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001923 goto brd_input;
1924
1925 /* Accept zero addresses only to limited broadcast;
1926 * I even do not know to fix it or not. Waiting for complains :-)
1927 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08001928 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001929 goto martian_source;
1930
Jan Engelhardt1e637c72008-01-21 03:18:08 -08001931 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08001932 ipv4_is_loopback(daddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933 goto martian_destination;
1934
1935 /*
1936 * Now we are ready to route packet.
1937 */
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001938 if ((err = fib_lookup(net, &fl, &res)) != 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001939 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001940 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001941 goto no_route;
1942 }
1943 free_res = 1;
1944
1945 RT_CACHE_STAT_INC(in_slow_tot);
1946
1947 if (res.type == RTN_BROADCAST)
1948 goto brd_input;
1949
1950 if (res.type == RTN_LOCAL) {
1951 int result;
1952 result = fib_validate_source(saddr, daddr, tos,
Denis V. Lunev84a885f2008-01-21 17:34:35 -08001953 net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954 dev, &spec_dst, &itag);
1955 if (result < 0)
1956 goto martian_source;
1957 if (result)
1958 flags |= RTCF_DIRECTSRC;
1959 spec_dst = daddr;
1960 goto local_input;
1961 }
1962
1963 if (!IN_DEV_FORWARD(in_dev))
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07001964 goto e_hostunreach;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001965 if (res.type != RTN_UNICAST)
1966 goto martian_destination;
1967
1968 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001969done:
1970 in_dev_put(in_dev);
1971 if (free_res)
1972 fib_res_put(&res);
1973out: return err;
1974
1975brd_input:
1976 if (skb->protocol != htons(ETH_P_IP))
1977 goto e_inval;
1978
Joe Perchesf97c1e02007-12-16 13:45:43 -08001979 if (ipv4_is_zeronet(saddr))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001980 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1981 else {
1982 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1983 &itag);
1984 if (err < 0)
1985 goto martian_source;
1986 if (err)
1987 flags |= RTCF_DIRECTSRC;
1988 }
1989 flags |= RTCF_BROADCAST;
1990 res.type = RTN_BROADCAST;
1991 RT_CACHE_STAT_INC(in_brd);
1992
1993local_input:
1994 rth = dst_alloc(&ipv4_dst_ops);
1995 if (!rth)
1996 goto e_nobufs;
1997
1998 rth->u.dst.output= ip_rt_bug;
Eric Dumazet29e75252008-01-31 17:05:09 -08001999 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002000
2001 atomic_set(&rth->u.dst.__refcnt, 1);
2002 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002003 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002004 rth->u.dst.flags |= DST_NOPOLICY;
2005 rth->fl.fl4_dst = daddr;
2006 rth->rt_dst = daddr;
2007 rth->fl.fl4_tos = tos;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002008 rth->fl.mark = skb->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002009 rth->fl.fl4_src = saddr;
2010 rth->rt_src = saddr;
2011#ifdef CONFIG_NET_CLS_ROUTE
2012 rth->u.dst.tclassid = itag;
2013#endif
2014 rth->rt_iif =
2015 rth->fl.iif = dev->ifindex;
Denis V. Lunev84a885f2008-01-21 17:34:35 -08002016 rth->u.dst.dev = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002017 dev_hold(rth->u.dst.dev);
2018 rth->idev = in_dev_get(rth->u.dst.dev);
2019 rth->rt_gateway = daddr;
2020 rth->rt_spec_dst= spec_dst;
2021 rth->u.dst.input= ip_local_deliver;
2022 rth->rt_flags = flags|RTCF_LOCAL;
2023 if (res.type == RTN_UNREACHABLE) {
2024 rth->u.dst.input= ip_error;
2025 rth->u.dst.error= -err;
2026 rth->rt_flags &= ~RTCF_LOCAL;
2027 }
2028 rth->rt_type = res.type;
Al Viro8c7bc842006-09-26 21:26:19 -07002029 hash = rt_hash(daddr, saddr, fl.iif);
Eric Dumazetee6b9672008-03-05 18:30:47 -08002030 err = rt_intern_hash(hash, rth, &skb->rtable);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031 goto done;
2032
2033no_route:
2034 RT_CACHE_STAT_INC(in_no_route);
2035 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2036 res.type = RTN_UNREACHABLE;
Mitsuru Chinen7f538782007-12-07 01:07:24 -08002037 if (err == -ESRCH)
2038 err = -ENETUNREACH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002039 goto local_input;
2040
2041 /*
2042 * Do not cache martian addresses: they should be logged (RFC1812)
2043 */
2044martian_destination:
2045 RT_CACHE_STAT_INC(in_martian_dst);
2046#ifdef CONFIG_IP_ROUTE_VERBOSE
2047 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
YOSHIFUJI Hideakia7d632b2008-04-14 04:09:00 -07002048 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2049 NIPQUAD_FMT ", dev %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07002050 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2051#endif
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002052
2053e_hostunreach:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002054 err = -EHOSTUNREACH;
2055 goto done;
Dietmar Eggemann2c2910a2005-06-28 13:06:23 -07002056
Linus Torvalds1da177e2005-04-16 15:20:36 -07002057e_inval:
2058 err = -EINVAL;
2059 goto done;
2060
2061e_nobufs:
2062 err = -ENOBUFS;
2063 goto done;
2064
2065martian_source:
2066 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2067 goto e_inval;
2068}
2069
Al Viro9e12bb22006-09-26 21:25:20 -07002070int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002071 u8 tos, struct net_device *dev)
2072{
2073 struct rtable * rth;
2074 unsigned hash;
2075 int iif = dev->ifindex;
Denis V. Lunevb5921912008-01-22 23:50:25 -08002076 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09002078 net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002079 tos &= IPTOS_RT_MASK;
Al Viro8c7bc842006-09-26 21:26:19 -07002080 hash = rt_hash(daddr, saddr, iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002081
2082 rcu_read_lock();
2083 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002084 rth = rcu_dereference(rth->u.dst.rt_next)) {
Stephen Hemmingerc0b8c322008-04-10 04:00:28 -07002085 if (((rth->fl.fl4_dst ^ daddr) |
2086 (rth->fl.fl4_src ^ saddr) |
2087 (rth->fl.iif ^ iif) |
2088 rth->fl.oif |
2089 (rth->fl.fl4_tos ^ tos)) == 0 &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002090 rth->fl.mark == skb->mark &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002091 net_eq(dev_net(rth->u.dst.dev), net) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08002092 rth->rt_genid == atomic_read(&rt_genid)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002093 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002094 RT_CACHE_STAT_INC(in_hit);
2095 rcu_read_unlock();
Eric Dumazetee6b9672008-03-05 18:30:47 -08002096 skb->rtable = rth;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097 return 0;
2098 }
2099 RT_CACHE_STAT_INC(in_hlist_search);
2100 }
2101 rcu_read_unlock();
2102
2103 /* Multicast recognition logic is moved from route cache to here.
2104 The problem was that too many Ethernet cards have broken/missing
2105 hardware multicast filters :-( As result the host on multicasting
2106 network acquires a lot of useless route cache entries, sort of
2107 SDR messages from all the world. Now we try to get rid of them.
2108 Really, provided software IP multicast filter is organized
2109 reasonably (at least, hashed), it does not result in a slowdown
2110 comparing with route cache reject entries.
2111 Note, that multicast routers are not affected, because
2112 route cache entry is created eventually.
2113 */
Joe Perchesf97c1e02007-12-16 13:45:43 -08002114 if (ipv4_is_multicast(daddr)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002115 struct in_device *in_dev;
2116
2117 rcu_read_lock();
Herbert Xue5ed6392005-10-03 14:35:55 -07002118 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002119 int our = ip_check_mc(in_dev, daddr, saddr,
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002120 ip_hdr(skb)->protocol);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002121 if (our
2122#ifdef CONFIG_IP_MROUTE
Joe Perchesf97c1e02007-12-16 13:45:43 -08002123 || (!ipv4_is_local_multicast(daddr) &&
2124 IN_DEV_MFORWARD(in_dev))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002125#endif
2126 ) {
2127 rcu_read_unlock();
2128 return ip_route_input_mc(skb, daddr, saddr,
2129 tos, dev, our);
2130 }
2131 }
2132 rcu_read_unlock();
2133 return -EINVAL;
2134 }
2135 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2136}
2137
Stephen Hemminger5969f712008-04-10 01:52:09 -07002138static int __mkroute_output(struct rtable **result,
2139 struct fib_result *res,
2140 const struct flowi *fl,
2141 const struct flowi *oldflp,
2142 struct net_device *dev_out,
2143 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002144{
2145 struct rtable *rth;
2146 struct in_device *in_dev;
2147 u32 tos = RT_FL_TOS(oldflp);
2148 int err = 0;
2149
Joe Perchesf97c1e02007-12-16 13:45:43 -08002150 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151 return -EINVAL;
2152
Al Viroe4485152006-09-26 22:15:01 -07002153 if (fl->fl4_dst == htonl(0xFFFFFFFF))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154 res->type = RTN_BROADCAST;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002155 else if (ipv4_is_multicast(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002156 res->type = RTN_MULTICAST;
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002157 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158 return -EINVAL;
2159
2160 if (dev_out->flags & IFF_LOOPBACK)
2161 flags |= RTCF_LOCAL;
2162
2163 /* get work reference to inet device */
2164 in_dev = in_dev_get(dev_out);
2165 if (!in_dev)
2166 return -EINVAL;
2167
2168 if (res->type == RTN_BROADCAST) {
2169 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2170 if (res->fi) {
2171 fib_info_put(res->fi);
2172 res->fi = NULL;
2173 }
2174 } else if (res->type == RTN_MULTICAST) {
2175 flags |= RTCF_MULTICAST|RTCF_LOCAL;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002176 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002177 oldflp->proto))
2178 flags &= ~RTCF_LOCAL;
2179 /* If multicast route do not exist use
2180 default one, but do not gateway in this case.
2181 Yes, it is hack.
2182 */
2183 if (res->fi && res->prefixlen < 4) {
2184 fib_info_put(res->fi);
2185 res->fi = NULL;
2186 }
2187 }
2188
2189
2190 rth = dst_alloc(&ipv4_dst_ops);
2191 if (!rth) {
2192 err = -ENOBUFS;
2193 goto cleanup;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002194 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002195
Julian Anastasovce723d82005-09-08 13:34:47 -07002196 atomic_set(&rth->u.dst.__refcnt, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197 rth->u.dst.flags= DST_HOST;
Herbert Xu42f811b2007-06-04 23:34:44 -07002198 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002199 rth->u.dst.flags |= DST_NOXFRM;
Herbert Xu42f811b2007-06-04 23:34:44 -07002200 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002201 rth->u.dst.flags |= DST_NOPOLICY;
2202
2203 rth->fl.fl4_dst = oldflp->fl4_dst;
2204 rth->fl.fl4_tos = tos;
2205 rth->fl.fl4_src = oldflp->fl4_src;
2206 rth->fl.oif = oldflp->oif;
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002207 rth->fl.mark = oldflp->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208 rth->rt_dst = fl->fl4_dst;
2209 rth->rt_src = fl->fl4_src;
2210 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002211 /* get references to the devices that are to be hold by the routing
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212 cache entry */
2213 rth->u.dst.dev = dev_out;
2214 dev_hold(dev_out);
2215 rth->idev = in_dev_get(dev_out);
2216 rth->rt_gateway = fl->fl4_dst;
2217 rth->rt_spec_dst= fl->fl4_src;
2218
2219 rth->u.dst.output=ip_output;
Eric Dumazet29e75252008-01-31 17:05:09 -08002220 rth->rt_genid = atomic_read(&rt_genid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221
2222 RT_CACHE_STAT_INC(out_slow_tot);
2223
2224 if (flags & RTCF_LOCAL) {
2225 rth->u.dst.input = ip_local_deliver;
2226 rth->rt_spec_dst = fl->fl4_dst;
2227 }
2228 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2229 rth->rt_spec_dst = fl->fl4_src;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002230 if (flags & RTCF_LOCAL &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231 !(dev_out->flags & IFF_LOOPBACK)) {
2232 rth->u.dst.output = ip_mc_output;
2233 RT_CACHE_STAT_INC(out_slow_mc);
2234 }
2235#ifdef CONFIG_IP_MROUTE
2236 if (res->type == RTN_MULTICAST) {
2237 if (IN_DEV_MFORWARD(in_dev) &&
Joe Perchesf97c1e02007-12-16 13:45:43 -08002238 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002239 rth->u.dst.input = ip_mr_input;
2240 rth->u.dst.output = ip_mc_output;
2241 }
2242 }
2243#endif
2244 }
2245
2246 rt_set_nexthop(rth, res, 0);
2247
2248 rth->rt_flags = flags;
2249
2250 *result = rth;
2251 cleanup:
2252 /* release work reference to inet device */
2253 in_dev_put(in_dev);
2254
2255 return err;
2256}
2257
Stephen Hemminger5969f712008-04-10 01:52:09 -07002258static int ip_mkroute_output(struct rtable **rp,
2259 struct fib_result *res,
2260 const struct flowi *fl,
2261 const struct flowi *oldflp,
2262 struct net_device *dev_out,
2263 unsigned flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264{
Chuck Short7abaa272005-06-22 22:10:23 -07002265 struct rtable *rth = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002266 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2267 unsigned hash;
2268 if (err == 0) {
Al Viro8c7bc842006-09-26 21:26:19 -07002269 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002270 err = rt_intern_hash(hash, rth, rp);
2271 }
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002272
Linus Torvalds1da177e2005-04-16 15:20:36 -07002273 return err;
2274}
2275
/*
 * Major route resolver routine.
 */
2279
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002280static int ip_route_output_slow(struct net *net, struct rtable **rp,
2281 const struct flowi *oldflp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002282{
2283 u32 tos = RT_FL_TOS(oldflp);
2284 struct flowi fl = { .nl_u = { .ip4_u =
2285 { .daddr = oldflp->fl4_dst,
2286 .saddr = oldflp->fl4_src,
2287 .tos = tos & IPTOS_RT_MASK,
2288 .scope = ((tos & RTO_ONLINK) ?
2289 RT_SCOPE_LINK :
2290 RT_SCOPE_UNIVERSE),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 } },
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002292 .mark = oldflp->mark,
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002293 .iif = net->loopback_dev->ifindex,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002294 .oif = oldflp->oif };
2295 struct fib_result res;
2296 unsigned flags = 0;
2297 struct net_device *dev_out = NULL;
2298 int free_res = 0;
2299 int err;
2300
2301
2302 res.fi = NULL;
2303#ifdef CONFIG_IP_MULTIPLE_TABLES
2304 res.r = NULL;
2305#endif
2306
2307 if (oldflp->fl4_src) {
2308 err = -EINVAL;
Joe Perchesf97c1e02007-12-16 13:45:43 -08002309 if (ipv4_is_multicast(oldflp->fl4_src) ||
Jan Engelhardt1e637c72008-01-21 03:18:08 -08002310 ipv4_is_lbcast(oldflp->fl4_src) ||
Joe Perchesf97c1e02007-12-16 13:45:43 -08002311 ipv4_is_zeronet(oldflp->fl4_src))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312 goto out;
2313
2314 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002315 dev_out = ip_dev_find(net, oldflp->fl4_src);
David S. Millerf6c5d732007-05-18 02:07:50 -07002316 if (dev_out == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317 goto out;
2318
2319 /* I removed check for oif == dev_out->oif here.
2320 It was wrong for two reasons:
Denis V. Lunev1ab35272008-01-22 22:04:30 -08002321 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2322 is assigned to multiple interfaces.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002323 2. Moreover, we are allowed to send packets with saddr
2324 of another iface. --ANK
2325 */
2326
David S. Millerf6c5d732007-05-18 02:07:50 -07002327 if (oldflp->oif == 0
Joe Perchesf97c1e02007-12-16 13:45:43 -08002328 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2329 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002330 /* Special hack: user can direct multicasts
2331 and limited broadcast via necessary interface
2332 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2333 This hack is not just for fun, it allows
2334 vic,vat and friends to work.
2335 They bind socket to loopback, set ttl to zero
2336 and expect that it will work.
2337 From the viewpoint of routing cache they are broken,
2338 because we are not allowed to build multicast path
2339 with loopback source addr (look, routing cache
2340 cannot know, that ttl is zero, so that packet
2341 will not leave this host and route is valid).
2342 Luckily, this hack is good workaround.
2343 */
2344
2345 fl.oif = dev_out->ifindex;
2346 goto make_route;
2347 }
2348 if (dev_out)
2349 dev_put(dev_out);
2350 dev_out = NULL;
2351 }
2352
2353
2354 if (oldflp->oif) {
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002355 dev_out = dev_get_by_index(net, oldflp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002356 err = -ENODEV;
2357 if (dev_out == NULL)
2358 goto out;
Herbert Xue5ed6392005-10-03 14:35:55 -07002359
2360 /* RACE: Check return value of inet_select_addr instead. */
2361 if (__in_dev_get_rtnl(dev_out) == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002362 dev_put(dev_out);
2363 goto out; /* Wrong error code */
2364 }
2365
Joe Perchesf97c1e02007-12-16 13:45:43 -08002366 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2367 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002368 if (!fl.fl4_src)
2369 fl.fl4_src = inet_select_addr(dev_out, 0,
2370 RT_SCOPE_LINK);
2371 goto make_route;
2372 }
2373 if (!fl.fl4_src) {
Joe Perchesf97c1e02007-12-16 13:45:43 -08002374 if (ipv4_is_multicast(oldflp->fl4_dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002375 fl.fl4_src = inet_select_addr(dev_out, 0,
2376 fl.fl4_scope);
2377 else if (!oldflp->fl4_dst)
2378 fl.fl4_src = inet_select_addr(dev_out, 0,
2379 RT_SCOPE_HOST);
2380 }
2381 }
2382
2383 if (!fl.fl4_dst) {
2384 fl.fl4_dst = fl.fl4_src;
2385 if (!fl.fl4_dst)
2386 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2387 if (dev_out)
2388 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002389 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390 dev_hold(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002391 fl.oif = net->loopback_dev->ifindex;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002392 res.type = RTN_LOCAL;
2393 flags |= RTCF_LOCAL;
2394 goto make_route;
2395 }
2396
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002397 if (fib_lookup(net, &fl, &res)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002398 res.fi = NULL;
2399 if (oldflp->oif) {
2400 /* Apparently, routing tables are wrong. Assume,
2401 that the destination is on link.
2402
2403 WHY? DW.
2404 Because we are allowed to send to iface
2405 even if it has NO routes and NO assigned
2406 addresses. When oif is specified, routing
2407 tables are looked up with only one purpose:
2408 to catch if destination is gatewayed, rather than
2409 direct. Moreover, if MSG_DONTROUTE is set,
2410 we send packet, ignoring both routing tables
2411 and ifaddr state. --ANK
2412
2413
2414 We could make it even if oif is unknown,
2415 likely IPv6, but we do not.
2416 */
2417
2418 if (fl.fl4_src == 0)
2419 fl.fl4_src = inet_select_addr(dev_out, 0,
2420 RT_SCOPE_LINK);
2421 res.type = RTN_UNICAST;
2422 goto make_route;
2423 }
2424 if (dev_out)
2425 dev_put(dev_out);
2426 err = -ENETUNREACH;
2427 goto out;
2428 }
2429 free_res = 1;
2430
2431 if (res.type == RTN_LOCAL) {
2432 if (!fl.fl4_src)
2433 fl.fl4_src = fl.fl4_dst;
2434 if (dev_out)
2435 dev_put(dev_out);
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002436 dev_out = net->loopback_dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002437 dev_hold(dev_out);
2438 fl.oif = dev_out->ifindex;
2439 if (res.fi)
2440 fib_info_put(res.fi);
2441 res.fi = NULL;
2442 flags |= RTCF_LOCAL;
2443 goto make_route;
2444 }
2445
2446#ifdef CONFIG_IP_ROUTE_MULTIPATH
2447 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2448 fib_select_multipath(&fl, &res);
2449 else
2450#endif
2451 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
Denis V. Lunevb40afd02008-01-22 22:06:19 -08002452 fib_select_default(net, &fl, &res);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002453
2454 if (!fl.fl4_src)
2455 fl.fl4_src = FIB_RES_PREFSRC(res);
2456
2457 if (dev_out)
2458 dev_put(dev_out);
2459 dev_out = FIB_RES_DEV(res);
2460 dev_hold(dev_out);
2461 fl.oif = dev_out->ifindex;
2462
2463
2464make_route:
2465 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2466
2467
2468 if (free_res)
2469 fib_res_put(&res);
2470 if (dev_out)
2471 dev_put(dev_out);
2472out: return err;
2473}
2474
Denis V. Lunev611c1832008-01-22 22:06:48 -08002475int __ip_route_output_key(struct net *net, struct rtable **rp,
2476 const struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002477{
2478 unsigned hash;
2479 struct rtable *rth;
2480
Al Viro8c7bc842006-09-26 21:26:19 -07002481 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002482
2483 rcu_read_lock_bh();
2484 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002485 rth = rcu_dereference(rth->u.dst.rt_next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002486 if (rth->fl.fl4_dst == flp->fl4_dst &&
2487 rth->fl.fl4_src == flp->fl4_src &&
2488 rth->fl.iif == 0 &&
2489 rth->fl.oif == flp->oif &&
Thomas Graf47dcf0c2006-11-09 15:20:38 -08002490 rth->fl.mark == flp->mark &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002491 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
Denis V. Lunevb5921912008-01-22 23:50:25 -08002492 (IPTOS_RT_MASK | RTO_ONLINK)) &&
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002493 net_eq(dev_net(rth->u.dst.dev), net) &&
Eric Dumazet29e75252008-01-31 17:05:09 -08002494 rth->rt_genid == atomic_read(&rt_genid)) {
Pavel Emelyanov03f49f32007-11-10 21:28:34 -08002495 dst_use(&rth->u.dst, jiffies);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002496 RT_CACHE_STAT_INC(out_hit);
2497 rcu_read_unlock_bh();
2498 *rp = rth;
2499 return 0;
2500 }
2501 RT_CACHE_STAT_INC(out_hlist_search);
2502 }
2503 rcu_read_unlock_bh();
2504
Denis V. Lunev611c1832008-01-22 22:06:48 -08002505 return ip_route_output_slow(net, rp, flp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506}
2507
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002508EXPORT_SYMBOL_GPL(__ip_route_output_key);
2509
David S. Miller14e50e52007-05-24 18:17:54 -07002510static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2511{
2512}
2513
2514static struct dst_ops ipv4_dst_blackhole_ops = {
2515 .family = AF_INET,
2516 .protocol = __constant_htons(ETH_P_IP),
2517 .destroy = ipv4_dst_destroy,
2518 .check = ipv4_dst_check,
2519 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2520 .entry_size = sizeof(struct rtable),
Eric Dumazete2422972008-01-30 20:07:45 -08002521 .entries = ATOMIC_INIT(0),
David S. Miller14e50e52007-05-24 18:17:54 -07002522};
2523
2524
Denis V. Lunevce259992008-03-22 17:42:37 -07002525static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
David S. Miller14e50e52007-05-24 18:17:54 -07002526{
2527 struct rtable *ort = *rp;
2528 struct rtable *rt = (struct rtable *)
2529 dst_alloc(&ipv4_dst_blackhole_ops);
2530
2531 if (rt) {
2532 struct dst_entry *new = &rt->u.dst;
2533
2534 atomic_set(&new->__refcnt, 1);
2535 new->__use = 1;
Herbert Xu352e5122007-11-13 21:34:06 -08002536 new->input = dst_discard;
2537 new->output = dst_discard;
David S. Miller14e50e52007-05-24 18:17:54 -07002538 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2539
2540 new->dev = ort->u.dst.dev;
2541 if (new->dev)
2542 dev_hold(new->dev);
2543
2544 rt->fl = ort->fl;
2545
2546 rt->idev = ort->idev;
2547 if (rt->idev)
2548 in_dev_hold(rt->idev);
Eric Dumazet29e75252008-01-31 17:05:09 -08002549 rt->rt_genid = atomic_read(&rt_genid);
David S. Miller14e50e52007-05-24 18:17:54 -07002550 rt->rt_flags = ort->rt_flags;
2551 rt->rt_type = ort->rt_type;
2552 rt->rt_dst = ort->rt_dst;
2553 rt->rt_src = ort->rt_src;
2554 rt->rt_iif = ort->rt_iif;
2555 rt->rt_gateway = ort->rt_gateway;
2556 rt->rt_spec_dst = ort->rt_spec_dst;
2557 rt->peer = ort->peer;
2558 if (rt->peer)
2559 atomic_inc(&rt->peer->refcnt);
2560
2561 dst_free(new);
2562 }
2563
2564 dst_release(&(*rp)->u.dst);
2565 *rp = rt;
2566 return (rt ? 0 : -ENOMEM);
2567}
2568
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002569int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2570 struct sock *sk, int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002571{
2572 int err;
2573
Denis V. Lunevf1b050b2008-01-22 22:07:10 -08002574 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002575 return err;
2576
2577 if (flp->proto) {
2578 if (!flp->fl4_src)
2579 flp->fl4_src = (*rp)->rt_src;
2580 if (!flp->fl4_dst)
2581 flp->fl4_dst = (*rp)->rt_dst;
Herbert Xubb728452007-12-12 18:48:58 -08002582 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2583 flags ? XFRM_LOOKUP_WAIT : 0);
David S. Miller14e50e52007-05-24 18:17:54 -07002584 if (err == -EREMOTE)
Denis V. Lunevce259992008-03-22 17:42:37 -07002585 err = ipv4_dst_blackhole(rp, flp);
David S. Miller14e50e52007-05-24 18:17:54 -07002586
2587 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002588 }
2589
2590 return 0;
2591}
2592
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -07002593EXPORT_SYMBOL_GPL(ip_route_output_flow);
2594
Denis V. Lunevf2063512008-01-22 22:07:34 -08002595int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002596{
Denis V. Lunevf2063512008-01-22 22:07:34 -08002597 return ip_route_output_flow(net, rp, flp, NULL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002598}
2599
2600static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002601 int nowait, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602{
Eric Dumazetee6b9672008-03-05 18:30:47 -08002603 struct rtable *rt = skb->rtable;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002604 struct rtmsg *r;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002605 struct nlmsghdr *nlh;
Thomas Grafe3703b32006-11-27 09:27:07 -08002606 long expires;
2607 u32 id = 0, ts = 0, tsage = 0, error;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002608
2609 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2610 if (nlh == NULL)
Patrick McHardy26932562007-01-31 23:16:40 -08002611 return -EMSGSIZE;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002612
2613 r = nlmsg_data(nlh);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002614 r->rtm_family = AF_INET;
2615 r->rtm_dst_len = 32;
2616 r->rtm_src_len = 0;
2617 r->rtm_tos = rt->fl.fl4_tos;
2618 r->rtm_table = RT_TABLE_MAIN;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002619 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002620 r->rtm_type = rt->rt_type;
2621 r->rtm_scope = RT_SCOPE_UNIVERSE;
2622 r->rtm_protocol = RTPROT_UNSPEC;
2623 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2624 if (rt->rt_flags & RTCF_NOTIFY)
2625 r->rtm_flags |= RTM_F_NOTIFY;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002626
Al Viro17fb2c62006-09-26 22:15:25 -07002627 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002628
Linus Torvalds1da177e2005-04-16 15:20:36 -07002629 if (rt->fl.fl4_src) {
2630 r->rtm_src_len = 32;
Al Viro17fb2c62006-09-26 22:15:25 -07002631 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002632 }
2633 if (rt->u.dst.dev)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002634 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002635#ifdef CONFIG_NET_CLS_ROUTE
2636 if (rt->u.dst.tclassid)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002637 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002638#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002639 if (rt->fl.iif)
Al Viro17fb2c62006-09-26 22:15:25 -07002640 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002641 else if (rt->rt_src != rt->fl.fl4_src)
Al Viro17fb2c62006-09-26 22:15:25 -07002642 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002643
Linus Torvalds1da177e2005-04-16 15:20:36 -07002644 if (rt->rt_dst != rt->rt_gateway)
Al Viro17fb2c62006-09-26 22:15:25 -07002645 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
Thomas Grafbe403ea2006-08-17 18:15:17 -07002646
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002648 goto nla_put_failure;
2649
Thomas Grafe3703b32006-11-27 09:27:07 -08002650 error = rt->u.dst.error;
2651 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002652 if (rt->peer) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002653 id = rt->peer->ip_id_count;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002654 if (rt->peer->tcp_ts_stamp) {
Thomas Grafe3703b32006-11-27 09:27:07 -08002655 ts = rt->peer->tcp_ts;
James Morris9d729f72007-03-04 16:12:44 -08002656 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002657 }
2658 }
Thomas Grafbe403ea2006-08-17 18:15:17 -07002659
Linus Torvalds1da177e2005-04-16 15:20:36 -07002660 if (rt->fl.iif) {
2661#ifdef CONFIG_IP_MROUTE
Al Viroe4485152006-09-26 22:15:01 -07002662 __be32 dst = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002663
Joe Perchesf97c1e02007-12-16 13:45:43 -08002664 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
Pavel Emelyanov586f1212007-12-16 13:32:48 -08002665 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002666 int err = ipmr_get_route(skb, r, nowait);
2667 if (err <= 0) {
2668 if (!nowait) {
2669 if (err == 0)
2670 return 0;
Thomas Grafbe403ea2006-08-17 18:15:17 -07002671 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002672 } else {
2673 if (err == -EMSGSIZE)
Thomas Grafbe403ea2006-08-17 18:15:17 -07002674 goto nla_put_failure;
Thomas Grafe3703b32006-11-27 09:27:07 -08002675 error = err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002676 }
2677 }
2678 } else
2679#endif
Thomas Grafbe403ea2006-08-17 18:15:17 -07002680 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681 }
2682
Thomas Grafe3703b32006-11-27 09:27:07 -08002683 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2684 expires, error) < 0)
2685 goto nla_put_failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002686
Thomas Grafbe403ea2006-08-17 18:15:17 -07002687 return nlmsg_end(skb, nlh);
2688
2689nla_put_failure:
Patrick McHardy26932562007-01-31 23:16:40 -08002690 nlmsg_cancel(skb, nlh);
2691 return -EMSGSIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002692}
2693
Thomas Graf63f34442007-03-22 11:55:17 -07002694static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002695{
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002696 struct net *net = sock_net(in_skb->sk);
Thomas Grafd889ce32006-08-17 18:15:44 -07002697 struct rtmsg *rtm;
2698 struct nlattr *tb[RTA_MAX+1];
Linus Torvalds1da177e2005-04-16 15:20:36 -07002699 struct rtable *rt = NULL;
Al Viro9e12bb22006-09-26 21:25:20 -07002700 __be32 dst = 0;
2701 __be32 src = 0;
2702 u32 iif;
Thomas Grafd889ce32006-08-17 18:15:44 -07002703 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002704 struct sk_buff *skb;
2705
Thomas Grafd889ce32006-08-17 18:15:44 -07002706 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2707 if (err < 0)
2708 goto errout;
2709
2710 rtm = nlmsg_data(nlh);
2711
Linus Torvalds1da177e2005-04-16 15:20:36 -07002712 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
Thomas Grafd889ce32006-08-17 18:15:44 -07002713 if (skb == NULL) {
2714 err = -ENOBUFS;
2715 goto errout;
2716 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002717
2718 /* Reserve room for dummy headers, this skb can pass
2719 through good chunk of routing engine.
2720 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07002721 skb_reset_mac_header(skb);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002722 skb_reset_network_header(skb);
Stephen Hemmingerd2c962b2006-04-17 17:27:11 -07002723
2724 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07002725 ip_hdr(skb)->protocol = IPPROTO_ICMP;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002726 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2727
Al Viro17fb2c62006-09-26 22:15:25 -07002728 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2729 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
Thomas Grafd889ce32006-08-17 18:15:44 -07002730 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002731
2732 if (iif) {
Thomas Grafd889ce32006-08-17 18:15:44 -07002733 struct net_device *dev;
2734
Denis V. Lunev19375042008-02-28 20:52:04 -08002735 dev = __dev_get_by_index(net, iif);
Thomas Grafd889ce32006-08-17 18:15:44 -07002736 if (dev == NULL) {
2737 err = -ENODEV;
2738 goto errout_free;
2739 }
2740
Linus Torvalds1da177e2005-04-16 15:20:36 -07002741 skb->protocol = htons(ETH_P_IP);
2742 skb->dev = dev;
2743 local_bh_disable();
2744 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2745 local_bh_enable();
Thomas Grafd889ce32006-08-17 18:15:44 -07002746
Eric Dumazetee6b9672008-03-05 18:30:47 -08002747 rt = skb->rtable;
Thomas Grafd889ce32006-08-17 18:15:44 -07002748 if (err == 0 && rt->u.dst.error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002749 err = -rt->u.dst.error;
2750 } else {
Thomas Grafd889ce32006-08-17 18:15:44 -07002751 struct flowi fl = {
2752 .nl_u = {
2753 .ip4_u = {
2754 .daddr = dst,
2755 .saddr = src,
2756 .tos = rtm->rtm_tos,
2757 },
2758 },
2759 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2760 };
Denis V. Lunev19375042008-02-28 20:52:04 -08002761 err = ip_route_output_key(net, &rt, &fl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002762 }
Thomas Grafd889ce32006-08-17 18:15:44 -07002763
Linus Torvalds1da177e2005-04-16 15:20:36 -07002764 if (err)
Thomas Grafd889ce32006-08-17 18:15:44 -07002765 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002766
Eric Dumazetee6b9672008-03-05 18:30:47 -08002767 skb->rtable = rt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002768 if (rtm->rtm_flags & RTM_F_NOTIFY)
2769 rt->rt_flags |= RTCF_NOTIFY;
2770
Linus Torvalds1da177e2005-04-16 15:20:36 -07002771 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
Denis V. Lunev19375042008-02-28 20:52:04 -08002772 RTM_NEWROUTE, 0, 0);
Thomas Grafd889ce32006-08-17 18:15:44 -07002773 if (err <= 0)
2774 goto errout_free;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002775
Denis V. Lunev19375042008-02-28 20:52:04 -08002776 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
Thomas Grafd889ce32006-08-17 18:15:44 -07002777errout:
Thomas Graf2942e902006-08-15 00:30:25 -07002778 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002779
Thomas Grafd889ce32006-08-17 18:15:44 -07002780errout_free:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002781 kfree_skb(skb);
Thomas Grafd889ce32006-08-17 18:15:44 -07002782 goto errout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002783}
2784
2785int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2786{
2787 struct rtable *rt;
2788 int h, s_h;
2789 int idx, s_idx;
Denis V. Lunev19375042008-02-28 20:52:04 -08002790 struct net *net;
2791
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09002792 net = sock_net(skb->sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002793
2794 s_h = cb->args[0];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002795 if (s_h < 0)
2796 s_h = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002797 s_idx = idx = cb->args[1];
Eric Dumazetd8c92832008-01-07 21:52:14 -08002798 for (h = s_h; h <= rt_hash_mask; h++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002799 rcu_read_lock_bh();
2800 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
Eric Dumazet093c2ca2007-02-09 16:19:26 -08002801 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09002802 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002803 continue;
Eric Dumazet29e75252008-01-31 17:05:09 -08002804 if (rt->rt_genid != atomic_read(&rt_genid))
2805 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806 skb->dst = dst_clone(&rt->u.dst);
2807 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002808 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
Jamal Hadi Salimb6544c02005-06-18 22:54:12 -07002809 1, NLM_F_MULTI) <= 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810 dst_release(xchg(&skb->dst, NULL));
2811 rcu_read_unlock_bh();
2812 goto done;
2813 }
2814 dst_release(xchg(&skb->dst, NULL));
2815 }
2816 rcu_read_unlock_bh();
Eric Dumazetd8c92832008-01-07 21:52:14 -08002817 s_idx = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002818 }
2819
2820done:
2821 cb->args[0] = h;
2822 cb->args[1] = idx;
2823 return skb->len;
2824}
2825
2826void ip_rt_multicast_event(struct in_device *in_dev)
2827{
Denis V. Lunev76e6ebf2008-07-05 19:00:44 -07002828 rt_cache_flush(dev_net(in_dev->dev), 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002829}
2830
2831#ifdef CONFIG_SYSCTL
Linus Torvalds1da177e2005-04-16 15:20:36 -07002832static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2833 struct file *filp, void __user *buffer,
2834 size_t *lenp, loff_t *ppos)
2835{
2836 if (write) {
Denis V. Lunev639e1042008-07-05 19:02:06 -07002837 int flush_delay;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07002838 struct net *net;
Denis V. Lunev639e1042008-07-05 19:02:06 -07002839 static DEFINE_MUTEX(flush_mutex);
2840
2841 mutex_lock(&flush_mutex);
2842 ctl->data = &flush_delay;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002843 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
Denis V. Lunev639e1042008-07-05 19:02:06 -07002844 ctl->data = NULL;
2845 mutex_unlock(&flush_mutex);
2846
Denis V. Lunev39a23e72008-07-05 19:02:33 -07002847 net = (struct net *)ctl->extra1;
2848 rt_cache_flush(net, flush_delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849 return 0;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002850 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002851
2852 return -EINVAL;
2853}
2854
2855static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2856 int __user *name,
2857 int nlen,
2858 void __user *oldval,
2859 size_t __user *oldlenp,
2860 void __user *newval,
Alexey Dobriyan1f29bcd2006-12-10 02:19:10 -08002861 size_t newlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002862{
2863 int delay;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07002864 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002865 if (newlen != sizeof(int))
2866 return -EINVAL;
2867 if (get_user(delay, (int __user *)newval))
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002868 return -EFAULT;
Denis V. Lunev39a23e72008-07-05 19:02:33 -07002869 net = (struct net *)table->extra1;
2870 rt_cache_flush(net, delay);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002871 return 0;
2872}
2873
2874ctl_table ipv4_route_table[] = {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002875 {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002876 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2877 .procname = "gc_thresh",
2878 .data = &ipv4_dst_ops.gc_thresh,
2879 .maxlen = sizeof(int),
2880 .mode = 0644,
2881 .proc_handler = &proc_dointvec,
2882 },
2883 {
2884 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2885 .procname = "max_size",
2886 .data = &ip_rt_max_size,
2887 .maxlen = sizeof(int),
2888 .mode = 0644,
2889 .proc_handler = &proc_dointvec,
2890 },
2891 {
2892 /* Deprecated. Use gc_min_interval_ms */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09002893
Linus Torvalds1da177e2005-04-16 15:20:36 -07002894 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2895 .procname = "gc_min_interval",
2896 .data = &ip_rt_gc_min_interval,
2897 .maxlen = sizeof(int),
2898 .mode = 0644,
2899 .proc_handler = &proc_dointvec_jiffies,
2900 .strategy = &sysctl_jiffies,
2901 },
2902 {
2903 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2904 .procname = "gc_min_interval_ms",
2905 .data = &ip_rt_gc_min_interval,
2906 .maxlen = sizeof(int),
2907 .mode = 0644,
2908 .proc_handler = &proc_dointvec_ms_jiffies,
2909 .strategy = &sysctl_ms_jiffies,
2910 },
2911 {
2912 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2913 .procname = "gc_timeout",
2914 .data = &ip_rt_gc_timeout,
2915 .maxlen = sizeof(int),
2916 .mode = 0644,
2917 .proc_handler = &proc_dointvec_jiffies,
2918 .strategy = &sysctl_jiffies,
2919 },
2920 {
2921 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2922 .procname = "gc_interval",
2923 .data = &ip_rt_gc_interval,
2924 .maxlen = sizeof(int),
2925 .mode = 0644,
2926 .proc_handler = &proc_dointvec_jiffies,
2927 .strategy = &sysctl_jiffies,
2928 },
2929 {
2930 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2931 .procname = "redirect_load",
2932 .data = &ip_rt_redirect_load,
2933 .maxlen = sizeof(int),
2934 .mode = 0644,
2935 .proc_handler = &proc_dointvec,
2936 },
2937 {
2938 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2939 .procname = "redirect_number",
2940 .data = &ip_rt_redirect_number,
2941 .maxlen = sizeof(int),
2942 .mode = 0644,
2943 .proc_handler = &proc_dointvec,
2944 },
2945 {
2946 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2947 .procname = "redirect_silence",
2948 .data = &ip_rt_redirect_silence,
2949 .maxlen = sizeof(int),
2950 .mode = 0644,
2951 .proc_handler = &proc_dointvec,
2952 },
2953 {
2954 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2955 .procname = "error_cost",
2956 .data = &ip_rt_error_cost,
2957 .maxlen = sizeof(int),
2958 .mode = 0644,
2959 .proc_handler = &proc_dointvec,
2960 },
2961 {
2962 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2963 .procname = "error_burst",
2964 .data = &ip_rt_error_burst,
2965 .maxlen = sizeof(int),
2966 .mode = 0644,
2967 .proc_handler = &proc_dointvec,
2968 },
2969 {
2970 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2971 .procname = "gc_elasticity",
2972 .data = &ip_rt_gc_elasticity,
2973 .maxlen = sizeof(int),
2974 .mode = 0644,
2975 .proc_handler = &proc_dointvec,
2976 },
2977 {
2978 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2979 .procname = "mtu_expires",
2980 .data = &ip_rt_mtu_expires,
2981 .maxlen = sizeof(int),
2982 .mode = 0644,
2983 .proc_handler = &proc_dointvec_jiffies,
2984 .strategy = &sysctl_jiffies,
2985 },
2986 {
2987 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2988 .procname = "min_pmtu",
2989 .data = &ip_rt_min_pmtu,
2990 .maxlen = sizeof(int),
2991 .mode = 0644,
2992 .proc_handler = &proc_dointvec,
2993 },
2994 {
2995 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2996 .procname = "min_adv_mss",
2997 .data = &ip_rt_min_advmss,
2998 .maxlen = sizeof(int),
2999 .mode = 0644,
3000 .proc_handler = &proc_dointvec,
3001 },
3002 {
3003 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
3004 .procname = "secret_interval",
3005 .data = &ip_rt_secret_interval,
3006 .maxlen = sizeof(int),
3007 .mode = 0644,
3008 .proc_handler = &proc_dointvec_jiffies,
3009 .strategy = &sysctl_jiffies,
3010 },
3011 { .ctl_name = 0 }
3012};
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003013
3014static __net_initdata struct ctl_path ipv4_route_path[] = {
3015 { .procname = "net", .ctl_name = CTL_NET, },
3016 { .procname = "ipv4", .ctl_name = NET_IPV4, },
3017 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3018 { },
3019};
3020
3021
3022static struct ctl_table ipv4_route_flush_table[] = {
3023 {
3024 .ctl_name = NET_IPV4_ROUTE_FLUSH,
3025 .procname = "flush",
3026 .maxlen = sizeof(int),
3027 .mode = 0200,
3028 .proc_handler = &ipv4_sysctl_rtcache_flush,
3029 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
3030 },
3031 { .ctl_name = 0 },
3032};
3033
3034static __net_init int sysctl_route_net_init(struct net *net)
3035{
3036 struct ctl_table *tbl;
3037
3038 tbl = ipv4_route_flush_table;
3039 if (net != &init_net) {
3040 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3041 if (tbl == NULL)
3042 goto err_dup;
3043 }
3044 tbl[0].extra1 = net;
3045
3046 net->ipv4.route_hdr =
3047 register_net_sysctl_table(net, ipv4_route_path, tbl);
3048 if (net->ipv4.route_hdr == NULL)
3049 goto err_reg;
3050 return 0;
3051
3052err_reg:
3053 if (tbl != ipv4_route_flush_table)
3054 kfree(tbl);
3055err_dup:
3056 return -ENOMEM;
3057}
3058
3059static __net_exit void sysctl_route_net_exit(struct net *net)
3060{
3061 struct ctl_table *tbl;
3062
3063 tbl = net->ipv4.route_hdr->ctl_table_arg;
3064 unregister_net_sysctl_table(net->ipv4.route_hdr);
3065 BUG_ON(tbl == ipv4_route_flush_table);
3066 kfree(tbl);
3067}
3068
3069static __net_initdata struct pernet_operations sysctl_route_ops = {
3070 .init = sysctl_route_net_init,
3071 .exit = sysctl_route_net_exit,
3072};
Linus Torvalds1da177e2005-04-16 15:20:36 -07003073#endif
3074
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003075
3076static __net_init int rt_secret_timer_init(struct net *net)
3077{
3078 net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3079 net->ipv4.rt_secret_timer.data = (unsigned long)net;
3080 init_timer_deferrable(&net->ipv4.rt_secret_timer);
3081
3082 net->ipv4.rt_secret_timer.expires =
3083 jiffies + net_random() % ip_rt_secret_interval +
3084 ip_rt_secret_interval;
3085 add_timer(&net->ipv4.rt_secret_timer);
3086 return 0;
3087}
3088
3089static __net_exit void rt_secret_timer_exit(struct net *net)
3090{
3091 del_timer_sync(&net->ipv4.rt_secret_timer);
3092}
3093
3094static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3095 .init = rt_secret_timer_init,
3096 .exit = rt_secret_timer_exit,
3097};
3098
3099
Linus Torvalds1da177e2005-04-16 15:20:36 -07003100#ifdef CONFIG_NET_CLS_ROUTE
/* Per-CPU accounting array (256 entries), allocated in ip_rt_init(). */
struct ip_rt_acct *ip_rt_acct __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003102#endif /* CONFIG_NET_CLS_ROUTE */
3103
3104static __initdata unsigned long rhash_entries;
3105static int __init set_rhash_entries(char *str)
3106{
3107 if (!str)
3108 return 0;
3109 rhash_entries = simple_strtoul(str, &str, 0);
3110 return 1;
3111}
3112__setup("rhash_entries=", set_rhash_entries);
3113
3114int __init ip_rt_init(void)
3115{
Eric Dumazet424c4b72005-07-05 14:58:19 -07003116 int rc = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003117
Eric Dumazet29e75252008-01-31 17:05:09 -08003118 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3119 (jiffies ^ (jiffies >> 7))));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003120
3121#ifdef CONFIG_NET_CLS_ROUTE
Eric Dumazet8dbde282007-11-16 03:32:10 -08003122 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003123 if (!ip_rt_acct)
3124 panic("IP: failed to allocate ip_rt_acct\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003125#endif
3126
Alexey Dobriyane5d679f332006-08-26 19:25:52 -07003127 ipv4_dst_ops.kmem_cachep =
3128 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09003129 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003130
David S. Miller14e50e52007-05-24 18:17:54 -07003131 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3132
Eric Dumazet424c4b72005-07-05 14:58:19 -07003133 rt_hash_table = (struct rt_hash_bucket *)
3134 alloc_large_system_hash("IP route cache",
3135 sizeof(struct rt_hash_bucket),
3136 rhash_entries,
3137 (num_physpages >= 128 * 1024) ?
Mike Stroyan18955cf2005-11-29 16:12:55 -08003138 15 : 17,
Kirill Korotaev8d1502d2006-08-07 20:44:22 -07003139 0,
Eric Dumazet424c4b72005-07-05 14:58:19 -07003140 &rt_hash_log,
3141 &rt_hash_mask,
3142 0);
Eric Dumazet22c047c2005-07-05 14:55:24 -07003143 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3144 rt_hash_lock_init();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003145
3146 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3147 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3148
Linus Torvalds1da177e2005-04-16 15:20:36 -07003149 devinet_init();
3150 ip_fib_init();
3151
Linus Torvalds1da177e2005-04-16 15:20:36 -07003152 /* All the timers, started at system startup tend
3153 to synchronize. Perturb it a bit.
3154 */
Eric Dumazet39c90ec2007-09-15 10:55:54 -07003155 schedule_delayed_work(&expires_work,
3156 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003157
Denis V. Lunev9f5e97e2008-07-05 19:02:59 -07003158 if (register_pernet_subsys(&rt_secret_timer_ops))
3159 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003160
Denis V. Lunev73b38712008-02-28 20:51:18 -08003161 if (ip_rt_proc_init())
Pavel Emelyanov107f1632007-12-05 21:14:28 -08003162 printk(KERN_ERR "Unable to create route proc files\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003163#ifdef CONFIG_XFRM
3164 xfrm_init();
3165 xfrm4_init();
3166#endif
Thomas Graf63f34442007-03-22 11:55:17 -07003167 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3168
Denis V. Lunev39a23e72008-07-05 19:02:33 -07003169#ifdef CONFIG_SYSCTL
3170 register_pernet_subsys(&sysctl_route_ops);
3171#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003172 return rc;
3173}
3174
3175EXPORT_SYMBOL(__ip_select_ident);
3176EXPORT_SYMBOL(ip_route_input);
3177EXPORT_SYMBOL(ip_route_output_key);