blob: 029362d66135124a3327046440b6c94892c4d1ae [file] [log] [blame]
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
17
18#include <linux/config.h>
19#include <asm/uaccess.h>
20#include <asm/system.h>
21#include <linux/bitops.h>
22#include <linux/types.h>
23#include <linux/kernel.h>
24#include <linux/jiffies.h>
25#include <linux/mm.h>
26#include <linux/string.h>
27#include <linux/socket.h>
28#include <linux/sockios.h>
29#include <linux/errno.h>
30#include <linux/in.h>
31#include <linux/inet.h>
32#include <linux/netdevice.h>
33#include <linux/if_arp.h>
34#include <linux/proc_fs.h>
35#include <linux/skbuff.h>
36#include <linux/netlink.h>
37#include <linux/init.h>
38
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/ip_fib.h>
45#include <net/ip_mp_alg.h>
46
47#include "fib_lookup.h"
48
49#define FSprintk(a...)
50
51static DEFINE_RWLOCK(fib_info_lock);
52static struct hlist_head *fib_info_hash;
53static struct hlist_head *fib_info_laddrhash;
54static unsigned int fib_hash_size;
55static unsigned int fib_info_cnt;
56
57#define DEVINDEX_HASHBITS 8
58#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
59static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
60
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a brace scope that declares
 * "nhsel" (the nexthop index) and "nh" (pointer to the current nexthop,
 * const for for_nexthops, writable for change_nexthops) and start a for
 * loop.  The scope MUST be closed with endfor_nexthops(fi); nhsel and nh
 * remain visible between the end of the loop body and endfor_nexthops().
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Taken around updates of fib_power / nh_power in this file. */
static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel;							\
	const struct fib_nh * nh;					\
	for (nhsel=0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh * nh;						\
	for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh);		\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath support only the first nexthop is visited; the
 * single-iteration loop keeps "continue"/"break" usable in loop bodies.
 * Hopefully gcc optimizes the dummy loop away.
 */

#define for_nexthops(fi) {						\
	int nhsel=0;							\
	const struct fib_nh * nh = (fi)->fib_nh;			\
	for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel=0;							\
	struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh);		\
	for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops() / change_nexthops(). */
#define endfor_nexthops(fi) }
84
85
86static struct
87{
88 int error;
89 u8 scope;
90} fib_props[RTA_MAX + 1] = {
91 {
92 .error = 0,
93 .scope = RT_SCOPE_NOWHERE,
94 }, /* RTN_UNSPEC */
95 {
96 .error = 0,
97 .scope = RT_SCOPE_UNIVERSE,
98 }, /* RTN_UNICAST */
99 {
100 .error = 0,
101 .scope = RT_SCOPE_HOST,
102 }, /* RTN_LOCAL */
103 {
104 .error = 0,
105 .scope = RT_SCOPE_LINK,
106 }, /* RTN_BROADCAST */
107 {
108 .error = 0,
109 .scope = RT_SCOPE_LINK,
110 }, /* RTN_ANYCAST */
111 {
112 .error = 0,
113 .scope = RT_SCOPE_UNIVERSE,
114 }, /* RTN_MULTICAST */
115 {
116 .error = -EINVAL,
117 .scope = RT_SCOPE_UNIVERSE,
118 }, /* RTN_BLACKHOLE */
119 {
120 .error = -EHOSTUNREACH,
121 .scope = RT_SCOPE_UNIVERSE,
122 }, /* RTN_UNREACHABLE */
123 {
124 .error = -EACCES,
125 .scope = RT_SCOPE_UNIVERSE,
126 }, /* RTN_PROHIBIT */
127 {
128 .error = -EAGAIN,
129 .scope = RT_SCOPE_UNIVERSE,
130 }, /* RTN_THROW */
131 {
132 .error = -EINVAL,
133 .scope = RT_SCOPE_NOWHERE,
134 }, /* RTN_NAT */
135 {
136 .error = -EINVAL,
137 .scope = RT_SCOPE_NOWHERE,
138 }, /* RTN_XRESOLVE */
139};
140
141
142/* Release a nexthop info record */
143
144void free_fib_info(struct fib_info *fi)
145{
146 if (fi->fib_dead == 0) {
147 printk("Freeing alive fib_info %p\n", fi);
148 return;
149 }
150 change_nexthops(fi) {
151 if (nh->nh_dev)
152 dev_put(nh->nh_dev);
153 nh->nh_dev = NULL;
154 } endfor_nexthops(fi);
155 fib_info_cnt--;
156 kfree(fi);
157}
158
159void fib_release_info(struct fib_info *fi)
160{
161 write_lock(&fib_info_lock);
162 if (fi && --fi->fib_treeref == 0) {
163 hlist_del(&fi->fib_hash);
164 if (fi->fib_prefsrc)
165 hlist_del(&fi->fib_lhash);
166 change_nexthops(fi) {
167 if (!nh->nh_dev)
168 continue;
169 hlist_del(&nh->nh_hash);
170 } endfor_nexthops(fi)
171 fi->fib_dead = 1;
172 fib_info_put(fi);
173 }
174 write_unlock(&fib_info_lock);
175}
176
177static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
178{
179 const struct fib_nh *onh = ofi->fib_nh;
180
181 for_nexthops(fi) {
182 if (nh->nh_oif != onh->nh_oif ||
183 nh->nh_gw != onh->nh_gw ||
184 nh->nh_scope != onh->nh_scope ||
185#ifdef CONFIG_IP_ROUTE_MULTIPATH
186 nh->nh_weight != onh->nh_weight ||
187#endif
188#ifdef CONFIG_NET_CLS_ROUTE
189 nh->nh_tclassid != onh->nh_tclassid ||
190#endif
191 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
192 return -1;
193 onh++;
194 } endfor_nexthops(fi);
195 return 0;
196}
197
198static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
199{
200 unsigned int mask = (fib_hash_size - 1);
201 unsigned int val = fi->fib_nhs;
202
203 val ^= fi->fib_protocol;
204 val ^= fi->fib_prefsrc;
205 val ^= fi->fib_priority;
206
207 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
208}
209
210static struct fib_info *fib_find_info(const struct fib_info *nfi)
211{
212 struct hlist_head *head;
213 struct hlist_node *node;
214 struct fib_info *fi;
215 unsigned int hash;
216
217 hash = fib_info_hashfn(nfi);
218 head = &fib_info_hash[hash];
219
220 hlist_for_each_entry(fi, node, head, fib_hash) {
221 if (fi->fib_nhs != nfi->fib_nhs)
222 continue;
223 if (nfi->fib_protocol == fi->fib_protocol &&
224 nfi->fib_prefsrc == fi->fib_prefsrc &&
225 nfi->fib_priority == fi->fib_priority &&
226 memcmp(nfi->fib_metrics, fi->fib_metrics,
227 sizeof(fi->fib_metrics)) == 0 &&
228 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
229 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
230 return fi;
231 }
232
233 return NULL;
234}
235
236static inline unsigned int fib_devindex_hashfn(unsigned int val)
237{
238 unsigned int mask = DEVINDEX_HASHSIZE - 1;
239
240 return (val ^
241 (val >> DEVINDEX_HASHBITS) ^
242 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
243}
244
245/* Check, that the gateway is already configured.
246 Used only by redirect accept routine.
247 */
248
249int ip_fib_check_default(u32 gw, struct net_device *dev)
250{
251 struct hlist_head *head;
252 struct hlist_node *node;
253 struct fib_nh *nh;
254 unsigned int hash;
255
256 read_lock(&fib_info_lock);
257
258 hash = fib_devindex_hashfn(dev->ifindex);
259 head = &fib_info_devhash[hash];
260 hlist_for_each_entry(nh, node, head, nh_hash) {
261 if (nh->nh_dev == dev &&
262 nh->nh_gw == gw &&
263 !(nh->nh_flags&RTNH_F_DEAD)) {
264 read_unlock(&fib_info_lock);
265 return 0;
266 }
267 }
268
269 read_unlock(&fib_info_lock);
270
271 return -1;
272}
273
274void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
275 int z, int tb_id,
276 struct nlmsghdr *n, struct netlink_skb_parms *req)
277{
278 struct sk_buff *skb;
279 u32 pid = req ? req->pid : 0;
280 int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
281
282 skb = alloc_skb(size, GFP_KERNEL);
283 if (!skb)
284 return;
285
286 if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
287 fa->fa_type, fa->fa_scope, &key, z,
288 fa->fa_tos,
289 fa->fa_info) < 0) {
290 kfree_skb(skb);
291 return;
292 }
293 NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
294 if (n->nlmsg_flags&NLM_F_ECHO)
295 atomic_inc(&skb->users);
296 netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
297 if (n->nlmsg_flags&NLM_F_ECHO)
298 netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
299}
300
301/* Return the first fib alias matching TOS with
302 * priority less than or equal to PRIO.
303 */
304struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
305{
306 if (fah) {
307 struct fib_alias *fa;
308 list_for_each_entry(fa, fah, fa_list) {
309 if (fa->fa_tos > tos)
310 continue;
311 if (fa->fa_info->fib_priority >= prio ||
312 fa->fa_tos < tos)
313 return fa;
314 }
315 }
316 return NULL;
317}
318
319int fib_detect_death(struct fib_info *fi, int order,
320 struct fib_info **last_resort, int *last_idx, int *dflt)
321{
322 struct neighbour *n;
323 int state = NUD_NONE;
324
325 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
326 if (n) {
327 state = n->nud_state;
328 neigh_release(n);
329 }
330 if (state==NUD_REACHABLE)
331 return 0;
332 if ((state&NUD_VALID) && order != *dflt)
333 return 0;
334 if ((state&NUD_VALID) ||
335 (*last_idx<0 && order > *dflt)) {
336 *last_resort = fi;
337 *last_idx = order;
338 }
339 return 1;
340}
341
342#ifdef CONFIG_IP_ROUTE_MULTIPATH
343
344static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
345{
346 while (RTA_OK(attr,attrlen)) {
347 if (attr->rta_type == type)
348 return *(u32*)RTA_DATA(attr);
349 attr = RTA_NEXT(attr, attrlen);
350 }
351 return 0;
352}
353
/*
 * Count the rtnexthop records packed in the payload of attribute @rta.
 *
 * Returns the number of records, or 0 if the payload is empty or
 * malformed.  A record is malformed when its rtnh_len is smaller than the
 * rtnexthop header itself or larger than what is left of the payload.
 * The lower bound matters: a record with rtnh_len == 0 would neither
 * consume payload nor advance the cursor, so the loop would never end.
 */
static int
fib_count_nexthops(struct rtattr *rta)
{
	int nhs = 0;
	struct rtnexthop *nhp = RTA_DATA(rta);
	int nhlen = RTA_PAYLOAD(rta);

	while (nhlen >= (int)sizeof(struct rtnexthop)) {
		if (nhp->rtnh_len < sizeof(struct rtnexthop))
			return 0;
		if ((nhlen -= nhp->rtnh_len) < 0)
			return 0;
		nhs++;
		nhp = RTNH_NEXT(nhp);
	}
	return nhs;
}
369
370static int
371fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
372{
373 struct rtnexthop *nhp = RTA_DATA(rta);
374 int nhlen = RTA_PAYLOAD(rta);
375
376 change_nexthops(fi) {
377 int attrlen = nhlen - sizeof(struct rtnexthop);
378 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
379 return -EINVAL;
380 nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
381 nh->nh_oif = nhp->rtnh_ifindex;
382 nh->nh_weight = nhp->rtnh_hops + 1;
383 if (attrlen) {
384 nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
385#ifdef CONFIG_NET_CLS_ROUTE
386 nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
387#endif
388 }
389 nhp = RTNH_NEXT(nhp);
390 } endfor_nexthops(fi);
391 return 0;
392}
393
394#endif
395
396int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
397 struct fib_info *fi)
398{
399#ifdef CONFIG_IP_ROUTE_MULTIPATH
400 struct rtnexthop *nhp;
401 int nhlen;
402#endif
403
404 if (rta->rta_priority &&
405 *rta->rta_priority != fi->fib_priority)
406 return 1;
407
408 if (rta->rta_oif || rta->rta_gw) {
409 if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
410 (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
411 return 0;
412 return 1;
413 }
414
415#ifdef CONFIG_IP_ROUTE_MULTIPATH
416 if (rta->rta_mp == NULL)
417 return 0;
418 nhp = RTA_DATA(rta->rta_mp);
419 nhlen = RTA_PAYLOAD(rta->rta_mp);
420
421 for_nexthops(fi) {
422 int attrlen = nhlen - sizeof(struct rtnexthop);
423 u32 gw;
424
425 if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
426 return -EINVAL;
427 if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
428 return 1;
429 if (attrlen) {
430 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
431 if (gw && gw != nh->nh_gw)
432 return 1;
433#ifdef CONFIG_NET_CLS_ROUTE
434 gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
435 if (gw && gw != nh->nh_tclassid)
436 return 1;
437#endif
438 }
439 nhp = RTNH_NEXT(nhp);
440 } endfor_nexthops(fi);
441#endif
442 return 0;
443}
444
445
/*
   Picture
   -------

   The semantics of a nexthop are very messy for historical reasons.
   We have to take into account that:
   a) the gateway can actually be a local interface address,
      so that a gatewayed route is in fact direct.
   b) the gateway must be an on-link address, possibly
      described not by an ifaddr but by a direct route.
   c) if both gateway and interface are specified, they must not
      contradict each other.
   d) if we use tunnel routes, the gateway may not be on-link.

   An attempt to reconcile all of these (alas, self-contradictory)
   conditions results in pretty ugly and hairy code with obscure logic.

   I chose to generalize it instead, so that the size
   of the code practically does not increase, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is a local address,
   "link" is a direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is a true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have a narrower scope. This recursion stops
   when gw has LOCAL scope or if the "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   The code is still hairy, but now it is apparently logically
   consistent and very flexible. E.g., as a by-product it allows
   independent exterior and interior routing processes
   to co-exist in peace.

   Normally it looks as follows.

   {universe prefix}  -> (gw, oif) [scope link]
			  |
			  |-> {link prefix} -> (gw, oif) [scope local]
						|
						|-> {local prefix} (terminal node)
 */
489
490static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
491{
492 int err;
493
494 if (nh->nh_gw) {
495 struct fib_result res;
496
497#ifdef CONFIG_IP_ROUTE_PERVASIVE
498 if (nh->nh_flags&RTNH_F_PERVASIVE)
499 return 0;
500#endif
501 if (nh->nh_flags&RTNH_F_ONLINK) {
502 struct net_device *dev;
503
504 if (r->rtm_scope >= RT_SCOPE_LINK)
505 return -EINVAL;
506 if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
507 return -EINVAL;
508 if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
509 return -ENODEV;
510 if (!(dev->flags&IFF_UP))
511 return -ENETDOWN;
512 nh->nh_dev = dev;
513 dev_hold(dev);
514 nh->nh_scope = RT_SCOPE_LINK;
515 return 0;
516 }
517 {
518 struct flowi fl = { .nl_u = { .ip4_u =
519 { .daddr = nh->nh_gw,
520 .scope = r->rtm_scope + 1 } },
521 .oif = nh->nh_oif };
522
523 /* It is not necessary, but requires a bit of thinking */
524 if (fl.fl4_scope < RT_SCOPE_LINK)
525 fl.fl4_scope = RT_SCOPE_LINK;
526 if ((err = fib_lookup(&fl, &res)) != 0)
527 return err;
528 }
529 err = -EINVAL;
530 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
531 goto out;
532 nh->nh_scope = res.scope;
533 nh->nh_oif = FIB_RES_OIF(res);
534 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
535 goto out;
536 dev_hold(nh->nh_dev);
537 err = -ENETDOWN;
538 if (!(nh->nh_dev->flags & IFF_UP))
539 goto out;
540 err = 0;
541out:
542 fib_res_put(&res);
543 return err;
544 } else {
545 struct in_device *in_dev;
546
547 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
548 return -EINVAL;
549
550 in_dev = inetdev_by_index(nh->nh_oif);
551 if (in_dev == NULL)
552 return -ENODEV;
553 if (!(in_dev->dev->flags&IFF_UP)) {
554 in_dev_put(in_dev);
555 return -ENETDOWN;
556 }
557 nh->nh_dev = in_dev->dev;
558 dev_hold(nh->nh_dev);
559 nh->nh_scope = RT_SCOPE_HOST;
560 in_dev_put(in_dev);
561 }
562 return 0;
563}
564
565static inline unsigned int fib_laddr_hashfn(u32 val)
566{
567 unsigned int mask = (fib_hash_size - 1);
568
569 return (val ^ (val >> 7) ^ (val >> 14)) & mask;
570}
571
572static struct hlist_head *fib_hash_alloc(int bytes)
573{
574 if (bytes <= PAGE_SIZE)
575 return kmalloc(bytes, GFP_KERNEL);
576 else
577 return (struct hlist_head *)
578 __get_free_pages(GFP_KERNEL, get_order(bytes));
579}
580
581static void fib_hash_free(struct hlist_head *hash, int bytes)
582{
583 if (!hash)
584 return;
585
586 if (bytes <= PAGE_SIZE)
587 kfree(hash);
588 else
589 free_pages((unsigned long) hash, get_order(bytes));
590}
591
592static void fib_hash_move(struct hlist_head *new_info_hash,
593 struct hlist_head *new_laddrhash,
594 unsigned int new_size)
595{
596 unsigned int old_size = fib_hash_size;
597 unsigned int i;
598
599 write_lock(&fib_info_lock);
600 fib_hash_size = new_size;
601
602 for (i = 0; i < old_size; i++) {
603 struct hlist_head *head = &fib_info_hash[i];
604 struct hlist_node *node, *n;
605 struct fib_info *fi;
606
607 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
608 struct hlist_head *dest;
609 unsigned int new_hash;
610
611 hlist_del(&fi->fib_hash);
612
613 new_hash = fib_info_hashfn(fi);
614 dest = &new_info_hash[new_hash];
615 hlist_add_head(&fi->fib_hash, dest);
616 }
617 }
618 fib_info_hash = new_info_hash;
619
620 for (i = 0; i < old_size; i++) {
621 struct hlist_head *lhead = &fib_info_laddrhash[i];
622 struct hlist_node *node, *n;
623 struct fib_info *fi;
624
625 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
626 struct hlist_head *ldest;
627 unsigned int new_hash;
628
629 hlist_del(&fi->fib_lhash);
630
631 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
632 ldest = &new_laddrhash[new_hash];
633 hlist_add_head(&fi->fib_lhash, ldest);
634 }
635 }
636 fib_info_laddrhash = new_laddrhash;
637
638 write_unlock(&fib_info_lock);
639}
640
641struct fib_info *
642fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
643 const struct nlmsghdr *nlh, int *errp)
644{
645 int err;
646 struct fib_info *fi = NULL;
647 struct fib_info *ofi;
648#ifdef CONFIG_IP_ROUTE_MULTIPATH
649 int nhs = 1;
650#else
651 const int nhs = 1;
652#endif
653#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
654 u32 mp_alg = IP_MP_ALG_NONE;
655#endif
656
657 /* Fast check to catch the most weird cases */
658 if (fib_props[r->rtm_type].scope > r->rtm_scope)
659 goto err_inval;
660
661#ifdef CONFIG_IP_ROUTE_MULTIPATH
662 if (rta->rta_mp) {
663 nhs = fib_count_nexthops(rta->rta_mp);
664 if (nhs == 0)
665 goto err_inval;
666 }
667#endif
668#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
669 if (rta->rta_mp_alg) {
670 mp_alg = *rta->rta_mp_alg;
671
672 if (mp_alg < IP_MP_ALG_NONE ||
673 mp_alg > IP_MP_ALG_MAX)
674 goto err_inval;
675 }
676#endif
677
678 err = -ENOBUFS;
679 if (fib_info_cnt >= fib_hash_size) {
680 unsigned int new_size = fib_hash_size << 1;
681 struct hlist_head *new_info_hash;
682 struct hlist_head *new_laddrhash;
683 unsigned int bytes;
684
685 if (!new_size)
686 new_size = 1;
687 bytes = new_size * sizeof(struct hlist_head *);
688 new_info_hash = fib_hash_alloc(bytes);
689 new_laddrhash = fib_hash_alloc(bytes);
690 if (!new_info_hash || !new_laddrhash) {
691 fib_hash_free(new_info_hash, bytes);
692 fib_hash_free(new_laddrhash, bytes);
693 } else {
694 memset(new_info_hash, 0, bytes);
695 memset(new_laddrhash, 0, bytes);
696
697 fib_hash_move(new_info_hash, new_laddrhash, new_size);
698 }
699
700 if (!fib_hash_size)
701 goto failure;
702 }
703
704 fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
705 if (fi == NULL)
706 goto failure;
707 fib_info_cnt++;
708 memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
709
710 fi->fib_protocol = r->rtm_protocol;
711
712 fi->fib_nhs = nhs;
713 change_nexthops(fi) {
714 nh->nh_parent = fi;
715 } endfor_nexthops(fi)
716
717 fi->fib_flags = r->rtm_flags;
718 if (rta->rta_priority)
719 fi->fib_priority = *rta->rta_priority;
720 if (rta->rta_mx) {
721 int attrlen = RTA_PAYLOAD(rta->rta_mx);
722 struct rtattr *attr = RTA_DATA(rta->rta_mx);
723
724 while (RTA_OK(attr, attrlen)) {
725 unsigned flavor = attr->rta_type;
726 if (flavor) {
727 if (flavor > RTAX_MAX)
728 goto err_inval;
729 fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
730 }
731 attr = RTA_NEXT(attr, attrlen);
732 }
733 }
734 if (rta->rta_prefsrc)
735 memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
736
737 if (rta->rta_mp) {
738#ifdef CONFIG_IP_ROUTE_MULTIPATH
739 if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
740 goto failure;
741 if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
742 goto err_inval;
743 if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
744 goto err_inval;
745#ifdef CONFIG_NET_CLS_ROUTE
746 if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
747 goto err_inval;
748#endif
749#else
750 goto err_inval;
751#endif
752 } else {
753 struct fib_nh *nh = fi->fib_nh;
754 if (rta->rta_oif)
755 nh->nh_oif = *rta->rta_oif;
756 if (rta->rta_gw)
757 memcpy(&nh->nh_gw, rta->rta_gw, 4);
758#ifdef CONFIG_NET_CLS_ROUTE
759 if (rta->rta_flow)
760 memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
761#endif
762 nh->nh_flags = r->rtm_flags;
763#ifdef CONFIG_IP_ROUTE_MULTIPATH
764 nh->nh_weight = 1;
765#endif
766 }
767
768#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
769 fi->fib_mp_alg = mp_alg;
770#endif
771
772 if (fib_props[r->rtm_type].error) {
773 if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
774 goto err_inval;
775 goto link_it;
776 }
777
778 if (r->rtm_scope > RT_SCOPE_HOST)
779 goto err_inval;
780
781 if (r->rtm_scope == RT_SCOPE_HOST) {
782 struct fib_nh *nh = fi->fib_nh;
783
784 /* Local address is added. */
785 if (nhs != 1 || nh->nh_gw)
786 goto err_inval;
787 nh->nh_scope = RT_SCOPE_NOWHERE;
788 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
789 err = -ENODEV;
790 if (nh->nh_dev == NULL)
791 goto failure;
792 } else {
793 change_nexthops(fi) {
794 if ((err = fib_check_nh(r, fi, nh)) != 0)
795 goto failure;
796 } endfor_nexthops(fi)
797 }
798
799 if (fi->fib_prefsrc) {
800 if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
801 memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
802 if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
803 goto err_inval;
804 }
805
806link_it:
807 if ((ofi = fib_find_info(fi)) != NULL) {
808 fi->fib_dead = 1;
809 free_fib_info(fi);
810 ofi->fib_treeref++;
811 return ofi;
812 }
813
814 fi->fib_treeref++;
815 atomic_inc(&fi->fib_clntref);
816 write_lock(&fib_info_lock);
817 hlist_add_head(&fi->fib_hash,
818 &fib_info_hash[fib_info_hashfn(fi)]);
819 if (fi->fib_prefsrc) {
820 struct hlist_head *head;
821
822 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
823 hlist_add_head(&fi->fib_lhash, head);
824 }
825 change_nexthops(fi) {
826 struct hlist_head *head;
827 unsigned int hash;
828
829 if (!nh->nh_dev)
830 continue;
831 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
832 head = &fib_info_devhash[hash];
833 hlist_add_head(&nh->nh_hash, head);
834 } endfor_nexthops(fi)
835 write_unlock(&fib_info_lock);
836 return fi;
837
838err_inval:
839 err = -EINVAL;
840
841failure:
842 *errp = err;
843 if (fi) {
844 fi->fib_dead = 1;
845 free_fib_info(fi);
846 }
847 return NULL;
848}
849
850int fib_semantic_match(struct list_head *head, const struct flowi *flp,
851 struct fib_result *res, __u32 zone, __u32 mask,
852 int prefixlen)
853{
854 struct fib_alias *fa;
855 int nh_sel = 0;
856
857 list_for_each_entry(fa, head, fa_list) {
858 int err;
859
860 if (fa->fa_tos &&
861 fa->fa_tos != flp->fl4_tos)
862 continue;
863
864 if (fa->fa_scope < flp->fl4_scope)
865 continue;
866
867 fa->fa_state |= FA_S_ACCESSED;
868
869 err = fib_props[fa->fa_type].error;
870 if (err == 0) {
871 struct fib_info *fi = fa->fa_info;
872
873 if (fi->fib_flags & RTNH_F_DEAD)
874 continue;
875
876 switch (fa->fa_type) {
877 case RTN_UNICAST:
878 case RTN_LOCAL:
879 case RTN_BROADCAST:
880 case RTN_ANYCAST:
881 case RTN_MULTICAST:
882 for_nexthops(fi) {
883 if (nh->nh_flags&RTNH_F_DEAD)
884 continue;
885 if (!flp->oif || flp->oif == nh->nh_oif)
886 break;
887 }
888#ifdef CONFIG_IP_ROUTE_MULTIPATH
889 if (nhsel < fi->fib_nhs) {
890 nh_sel = nhsel;
891 goto out_fill_res;
892 }
893#else
894 if (nhsel < 1) {
895 goto out_fill_res;
896 }
897#endif
898 endfor_nexthops(fi);
899 continue;
900
901 default:
902 printk(KERN_DEBUG "impossible 102\n");
903 return -EINVAL;
904 };
905 }
906 return err;
907 }
908 return 1;
909
910out_fill_res:
911 res->prefixlen = prefixlen;
912 res->nh_sel = nh_sel;
913 res->type = fa->fa_type;
914 res->scope = fa->fa_scope;
915 res->fi = fa->fa_info;
916#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
917 res->netmask = mask;
918 res->network = zone &
919 (0xFFFFFFFF >> (32 - prefixlen));
920#endif
921 atomic_inc(&res->fi->fib_clntref);
922 return 0;
923}
924
925/* Find appropriate source address to this destination */
926
927u32 __fib_res_prefsrc(struct fib_result *res)
928{
929 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
930}
931
932int
933fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
934 u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
935 struct fib_info *fi)
936{
937 struct rtmsg *rtm;
938 struct nlmsghdr *nlh;
939 unsigned char *b = skb->tail;
940
941 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
942 rtm = NLMSG_DATA(nlh);
943 rtm->rtm_family = AF_INET;
944 rtm->rtm_dst_len = dst_len;
945 rtm->rtm_src_len = 0;
946 rtm->rtm_tos = tos;
947 rtm->rtm_table = tb_id;
948 rtm->rtm_type = type;
949 rtm->rtm_flags = fi->fib_flags;
950 rtm->rtm_scope = scope;
951 if (rtm->rtm_dst_len)
952 RTA_PUT(skb, RTA_DST, 4, dst);
953 rtm->rtm_protocol = fi->fib_protocol;
954 if (fi->fib_priority)
955 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
956#ifdef CONFIG_NET_CLS_ROUTE
957 if (fi->fib_nh[0].nh_tclassid)
958 RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
959#endif
960 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
961 goto rtattr_failure;
962 if (fi->fib_prefsrc)
963 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
964 if (fi->fib_nhs == 1) {
965 if (fi->fib_nh->nh_gw)
966 RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
967 if (fi->fib_nh->nh_oif)
968 RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
969 }
970#ifdef CONFIG_IP_ROUTE_MULTIPATH
971 if (fi->fib_nhs > 1) {
972 struct rtnexthop *nhp;
973 struct rtattr *mp_head;
974 if (skb_tailroom(skb) <= RTA_SPACE(0))
975 goto rtattr_failure;
976 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
977
978 for_nexthops(fi) {
979 if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
980 goto rtattr_failure;
981 nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
982 nhp->rtnh_flags = nh->nh_flags & 0xFF;
983 nhp->rtnh_hops = nh->nh_weight-1;
984 nhp->rtnh_ifindex = nh->nh_oif;
985 if (nh->nh_gw)
986 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
987 nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
988 } endfor_nexthops(fi);
989 mp_head->rta_type = RTA_MULTIPATH;
990 mp_head->rta_len = skb->tail - (u8*)mp_head;
991 }
992#endif
993 nlh->nlmsg_len = skb->tail - b;
994 return skb->len;
995
996nlmsg_failure:
997rtattr_failure:
998 skb_trim(skb, b - skb->data);
999 return -1;
1000}
1001
1002#ifndef CONFIG_IP_NOSIOCRT
1003
1004int
1005fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
1006 struct kern_rta *rta, struct rtentry *r)
1007{
1008 int plen;
1009 u32 *ptr;
1010
1011 memset(rtm, 0, sizeof(*rtm));
1012 memset(rta, 0, sizeof(*rta));
1013
1014 if (r->rt_dst.sa_family != AF_INET)
1015 return -EAFNOSUPPORT;
1016
1017 /* Check mask for validity:
1018 a) it must be contiguous.
1019 b) destination must have all host bits clear.
1020 c) if application forgot to set correct family (AF_INET),
1021 reject request unless it is absolutely clear i.e.
1022 both family and mask are zero.
1023 */
1024 plen = 32;
1025 ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
1026 if (!(r->rt_flags&RTF_HOST)) {
1027 u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
1028 if (r->rt_genmask.sa_family != AF_INET) {
1029 if (mask || r->rt_genmask.sa_family)
1030 return -EAFNOSUPPORT;
1031 }
1032 if (bad_mask(mask, *ptr))
1033 return -EINVAL;
1034 plen = inet_mask_len(mask);
1035 }
1036
1037 nl->nlmsg_flags = NLM_F_REQUEST;
1038 nl->nlmsg_pid = 0;
1039 nl->nlmsg_seq = 0;
1040 nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
1041 if (cmd == SIOCDELRT) {
1042 nl->nlmsg_type = RTM_DELROUTE;
1043 nl->nlmsg_flags = 0;
1044 } else {
1045 nl->nlmsg_type = RTM_NEWROUTE;
1046 nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
1047 rtm->rtm_protocol = RTPROT_BOOT;
1048 }
1049
1050 rtm->rtm_dst_len = plen;
1051 rta->rta_dst = ptr;
1052
1053 if (r->rt_metric) {
1054 *(u32*)&r->rt_pad3 = r->rt_metric - 1;
1055 rta->rta_priority = (u32*)&r->rt_pad3;
1056 }
1057 if (r->rt_flags&RTF_REJECT) {
1058 rtm->rtm_scope = RT_SCOPE_HOST;
1059 rtm->rtm_type = RTN_UNREACHABLE;
1060 return 0;
1061 }
1062 rtm->rtm_scope = RT_SCOPE_NOWHERE;
1063 rtm->rtm_type = RTN_UNICAST;
1064
1065 if (r->rt_dev) {
1066 char *colon;
1067 struct net_device *dev;
1068 char devname[IFNAMSIZ];
1069
1070 if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
1071 return -EFAULT;
1072 devname[IFNAMSIZ-1] = 0;
1073 colon = strchr(devname, ':');
1074 if (colon)
1075 *colon = 0;
1076 dev = __dev_get_by_name(devname);
1077 if (!dev)
1078 return -ENODEV;
1079 rta->rta_oif = &dev->ifindex;
1080 if (colon) {
1081 struct in_ifaddr *ifa;
1082 struct in_device *in_dev = __in_dev_get(dev);
1083 if (!in_dev)
1084 return -ENODEV;
1085 *colon = ':';
1086 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
1087 if (strcmp(ifa->ifa_label, devname) == 0)
1088 break;
1089 if (ifa == NULL)
1090 return -ENODEV;
1091 rta->rta_prefsrc = &ifa->ifa_local;
1092 }
1093 }
1094
1095 ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
1096 if (r->rt_gateway.sa_family == AF_INET && *ptr) {
1097 rta->rta_gw = ptr;
1098 if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
1099 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1100 }
1101
1102 if (cmd == SIOCDELRT)
1103 return 0;
1104
1105 if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
1106 return -EINVAL;
1107
1108 if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
1109 rtm->rtm_scope = RT_SCOPE_LINK;
1110
1111 if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
1112 struct rtattr *rec;
1113 struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
1114 if (mx == NULL)
1115 return -ENOMEM;
1116 rta->rta_mx = mx;
1117 mx->rta_type = RTA_METRICS;
1118 mx->rta_len = RTA_LENGTH(0);
1119 if (r->rt_flags&RTF_MTU) {
1120 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1121 rec->rta_type = RTAX_ADVMSS;
1122 rec->rta_len = RTA_LENGTH(4);
1123 mx->rta_len += RTA_LENGTH(4);
1124 *(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
1125 }
1126 if (r->rt_flags&RTF_WINDOW) {
1127 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1128 rec->rta_type = RTAX_WINDOW;
1129 rec->rta_len = RTA_LENGTH(4);
1130 mx->rta_len += RTA_LENGTH(4);
1131 *(u32*)RTA_DATA(rec) = r->rt_window;
1132 }
1133 if (r->rt_flags&RTF_IRTT) {
1134 rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
1135 rec->rta_type = RTAX_RTT;
1136 rec->rta_len = RTA_LENGTH(4);
1137 mx->rta_len += RTA_LENGTH(4);
1138 *(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
1139 }
1140 }
1141 return 0;
1142}
1143
1144#endif
1145
1146/*
1147 Update FIB if:
1148 - local address disappeared -> we must delete all the entries
1149 referring to it.
1150 - device went down -> we must shutdown all nexthops going via it.
1151 */
1152
1153int fib_sync_down(u32 local, struct net_device *dev, int force)
1154{
1155 int ret = 0;
1156 int scope = RT_SCOPE_NOWHERE;
1157
1158 if (force)
1159 scope = -1;
1160
1161 if (local && fib_info_laddrhash) {
1162 unsigned int hash = fib_laddr_hashfn(local);
1163 struct hlist_head *head = &fib_info_laddrhash[hash];
1164 struct hlist_node *node;
1165 struct fib_info *fi;
1166
1167 hlist_for_each_entry(fi, node, head, fib_lhash) {
1168 if (fi->fib_prefsrc == local) {
1169 fi->fib_flags |= RTNH_F_DEAD;
1170 ret++;
1171 }
1172 }
1173 }
1174
1175 if (dev) {
1176 struct fib_info *prev_fi = NULL;
1177 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1178 struct hlist_head *head = &fib_info_devhash[hash];
1179 struct hlist_node *node;
1180 struct fib_nh *nh;
1181
1182 hlist_for_each_entry(nh, node, head, nh_hash) {
1183 struct fib_info *fi = nh->nh_parent;
1184 int dead;
1185
1186 BUG_ON(!fi->fib_nhs);
1187 if (nh->nh_dev != dev || fi == prev_fi)
1188 continue;
1189 prev_fi = fi;
1190 dead = 0;
1191 change_nexthops(fi) {
1192 if (nh->nh_flags&RTNH_F_DEAD)
1193 dead++;
1194 else if (nh->nh_dev == dev &&
1195 nh->nh_scope != scope) {
1196 nh->nh_flags |= RTNH_F_DEAD;
1197#ifdef CONFIG_IP_ROUTE_MULTIPATH
1198 spin_lock_bh(&fib_multipath_lock);
1199 fi->fib_power -= nh->nh_power;
1200 nh->nh_power = 0;
1201 spin_unlock_bh(&fib_multipath_lock);
1202#endif
1203 dead++;
1204 }
1205#ifdef CONFIG_IP_ROUTE_MULTIPATH
1206 if (force > 1 && nh->nh_dev == dev) {
1207 dead = fi->fib_nhs;
1208 break;
1209 }
1210#endif
1211 } endfor_nexthops(fi)
1212 if (dead == fi->fib_nhs) {
1213 fi->fib_flags |= RTNH_F_DEAD;
1214 ret++;
1215 }
1216 }
1217 }
1218
1219 return ret;
1220}
1221
1222#ifdef CONFIG_IP_ROUTE_MULTIPATH
1223
1224/*
1225 Dead device goes up. We wake up dead nexthops.
1226 It takes sense only on multipath routes.
1227 */
1228
1229int fib_sync_up(struct net_device *dev)
1230{
1231 struct fib_info *prev_fi;
1232 unsigned int hash;
1233 struct hlist_head *head;
1234 struct hlist_node *node;
1235 struct fib_nh *nh;
1236 int ret;
1237
1238 if (!(dev->flags&IFF_UP))
1239 return 0;
1240
1241 prev_fi = NULL;
1242 hash = fib_devindex_hashfn(dev->ifindex);
1243 head = &fib_info_devhash[hash];
1244 ret = 0;
1245
1246 hlist_for_each_entry(nh, node, head, nh_hash) {
1247 struct fib_info *fi = nh->nh_parent;
1248 int alive;
1249
1250 BUG_ON(!fi->fib_nhs);
1251 if (nh->nh_dev != dev || fi == prev_fi)
1252 continue;
1253
1254 prev_fi = fi;
1255 alive = 0;
1256 change_nexthops(fi) {
1257 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1258 alive++;
1259 continue;
1260 }
1261 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1262 continue;
1263 if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
1264 continue;
1265 alive++;
1266 spin_lock_bh(&fib_multipath_lock);
1267 nh->nh_power = 0;
1268 nh->nh_flags &= ~RTNH_F_DEAD;
1269 spin_unlock_bh(&fib_multipath_lock);
1270 } endfor_nexthops(fi)
1271
1272 if (alive > 0) {
1273 fi->fib_flags &= ~RTNH_F_DEAD;
1274 ret++;
1275 }
1276 }
1277
1278 return ret;
1279}
1280
1281/*
1282 The algorithm is suboptimal, but it provides really
1283 fair weighted route distribution.
1284 */
1285
1286void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1287{
1288 struct fib_info *fi = res->fi;
1289 int w;
1290
1291 spin_lock_bh(&fib_multipath_lock);
1292 if (fi->fib_power <= 0) {
1293 int power = 0;
1294 change_nexthops(fi) {
1295 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1296 power += nh->nh_weight;
1297 nh->nh_power = nh->nh_weight;
1298 }
1299 } endfor_nexthops(fi);
1300 fi->fib_power = power;
1301 if (power <= 0) {
1302 spin_unlock_bh(&fib_multipath_lock);
1303 /* Race condition: route has just become dead. */
1304 res->nh_sel = 0;
1305 return;
1306 }
1307 }
1308
1309
1310 /* w should be random number [0..fi->fib_power-1],
1311 it is pretty bad approximation.
1312 */
1313
1314 w = jiffies % fi->fib_power;
1315
1316 change_nexthops(fi) {
1317 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1318 if ((w -= nh->nh_power) <= 0) {
1319 nh->nh_power--;
1320 fi->fib_power--;
1321 res->nh_sel = nhsel;
1322 spin_unlock_bh(&fib_multipath_lock);
1323 return;
1324 }
1325 }
1326 } endfor_nexthops(fi);
1327
1328 /* Race condition: route has just become dead. */
1329 res->nh_sel = 0;
1330 spin_unlock_bh(&fib_multipath_lock);
1331}
1332#endif