/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov		:	arithmetics in fragmentation.
 *					extension headers are implemented.
 *					route changes now work.
 *					ip6_forward does not confuse sniffers.
 *					etc.
 *
 *	H. von Brand		:	Added missing #include <linux/string.h>
 *	Imran Patel		:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *				:	add ip6_append_data and related functions
 *					for datagram xmit
 */
#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

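/*
 *	Pick a fragment identification value for this packet.  A single
 *	global counter is shared by all flows and protected by ip6_id_lock;
 *	zero is skipped when the counter wraps, which lets ip6_fragment()
 *	use 0 to mean "no id chosen yet".
 */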
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

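/*
 *	Hand a finished packet to the link layer: use the destination's
 *	cached hardware header when one exists, otherwise go through the
 *	neighbour's output function.  Packets without a usable neighbour
 *	are dropped and counted as OUTNOROUTES.
 */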
static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}

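/*
 *	Second half of output: loop multicast packets back to local
 *	listeners when required, then pass the packet through the
 *	NF_IP6_POST_ROUTING netfilter hook on its way to the device.
 */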
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
					&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

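/*
 *	Main output entry point: fragment the packet if it exceeds the
 *	path MTU (or if the route demands fragmentation on every packet),
 *	otherwise send it out directly.
 */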
int ip6_output(struct sk_buff *skb)
{
	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

#ifdef CONFIG_NETFILTER
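/*
 *	Redo the route lookup after netfilter has possibly rewritten the
 *	addresses, and attach the fresh route to the skb in place of the
 *	stale one.
 */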
int ip6_route_me_harder(struct sk_buff *skb)
{
	struct ipv6hdr *iph = skb->nh.ipv6h;
	struct dst_entry *dst;
	struct flowi fl = {
		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
		.nl_u =
		{ .ip6_u =
		  { .daddr = iph->daddr,
		    .saddr = iph->saddr, } },
		.proto = iph->nexthdr,
	};

	dst = ip6_route_output(skb->sk, &fl);

	if (dst->error) {
		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
		LIMIT_NETDEBUG(
			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
		dst_release(dst);
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = dst;
	return 0;
}
#endif

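/*
 *	If netfilter marked the packet as altered, re-route it before
 *	handing it to dst_output().
 */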
static inline int ip6_maybe_reroute(struct sk_buff *skb)
{
#ifdef CONFIG_NETFILTER
	if (skb->nfcache & NFC_ALTERED) {
		if (ip6_route_me_harder(skb) != 0) {
			kfree_skb(skb);
			return -EINVAL;
		}
	}
#endif /* CONFIG_NETFILTER */
	return dst_output(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	*(u32 *)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       ip6_maybe_reroute);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	*(u32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

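/*
 *	Deliver a Router Alert packet to every raw socket that registered
 *	interest in this RA selector; each socket but the last gets its
 *	own clone.  Returns 1 if at least one socket consumed the packet
 *	(the original skb then belongs to the last such socket), else 0.
 */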
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

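/*
 *	Forward a packet that is not addressed to us: check forwarding
 *	policy, hop limit, path MTU and source address sanity, possibly
 *	emit a redirect, then run the NF_IP6_FORWARD hook before
 *	transmission.
 */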
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We do not do any processing on RA packets;
	 *	we push them to user level AS IS, without any
	 *	warranty that the application will be able to
	 *	interpret them.  The reason is that we cannot
	 *	make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything.  Defragmentation would
	 *	also be a mistake: RA packets must not be fragmented,
	 *	because there is no warranty that different fragments
	 *	will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Mangling the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

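/*
 *	Copy per-packet metadata (packet type, priority, device, route,
 *	netfilter and traffic-control state) from the original skb to a
 *	freshly built fragment.
 */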
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	to->security = from->security;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#ifdef CONFIG_NETFILTER_DEBUG
	to->nf_debug = from->nf_debug;
#endif
#endif
}

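/*
 *	Walk the extension header chain to find where a fragment header
 *	should be inserted.  Hop-by-hop, routing and destination option
 *	headers belong to the unfragmentable part, except that a
 *	destination options header seen after a routing header starts the
 *	fragmentable part.  *nexthdr is left pointing at the "next header"
 *	field that must be patched to NEXTHDR_FRAGMENT.
 */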
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING)
				found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr)
				return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr *)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}

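/*
 *	Split an oversized packet into fragments.  The fast path reuses an
 *	existing frag_list when every chunk already has the right geometry;
 *	otherwise the slow path copies the payload into freshly allocated
 *	fragments of at most mtu bytes each.
 */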
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	u32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, tmp_hdr, hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr *)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header: pick an identification on the
		 *	first pass and reuse it for every later fragment.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		err = output(frag);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

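/*
 *	Find a route for the flow described by @fl, reusing the socket's
 *	cached route when it is still valid, and fill in a source address
 *	if the caller left it unspecified.
 */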
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info *)*dst;

			/* Yes, checking route validity in the not connected
			   case is not very simple. Take into account that
			   we do not support routing by source, TOS, and
			   MSG_DONTROUTE.		--ANK (980726)

			   1. If route was host route, check that
			      cached destination is current.
			      If it is network route, we still may
			      check its validity using saved pointer
			      to the last used address: daddr_cache.
			      We do not want to save whole address now,
			      (because main consumer of this service
			      is tcp, which does not have this problem),
			      so that the last trick works only on connected
			      sockets.
			   2. oif also should be the same.
			 */

			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err) {
#if IP6_DEBUG >= 2
			printk(KERN_DEBUG "ip6_dst_lookup: "
			       "no available source address\n");
#endif
			goto out_err_release;
		}
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}

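/*
 *	Append data to the pending (corked) queue of a socket, growing the
 *	last skb or allocating new ones sized so that each later becomes
 *	one fragment.  The queued packets are finally sent by
 *	ip6_push_pending_frames().
 */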
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
						 int offset, int len, int odd,
						 struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

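/*
 *	Splice the queued skbs into one packet (head skb plus frag_list),
 *	push the extension headers and the IPv6 header, and send the
 *	result through the NF_IP6_LOCAL_OUT hook.  The cork state is
 *	reset whether or not transmission succeeds.
 */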
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));

	*(u32 *)hdr = fl->fl6_flowlabel | htonl(0x60000000);

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev,
		      dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}

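/*
 *	Throw away everything queued by ip6_append_data() and reset the
 *	cork state, e.g. after an error.
 */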
void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}