/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

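/*
 * Pick the fragment identification value for an outgoing packet.  A single
 * global counter is shared by all flows and protected by a spinlock; the
 * value 0 is skipped so that a zero ID never appears on the wire.
 */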
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

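/*
 * Final transmit step: use the cached hardware header if the destination
 * entry has one, otherwise hand the packet to the neighbour output
 * function.  Packets with neither are counted as OutNoRoutes and dropped.
 */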
static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}

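/*
 * Post-routing transmit for a fully built packet.  For multicast
 * destinations that this host is itself a member of, a clone is looped
 * back through netfilter (unless the sending socket cleared mc_loop,
 * i.e. disabled multicast loopback), and packets whose hop limit is
 * already 0 are discarded rather than put on the wire.
 */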
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
					&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

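/*
 * Main output entry point hung off the dst: fragment when the packet is
 * larger than the path MTU (or the route demands fragmentation on every
 * packet, dst_allfrag), otherwise transmit directly.
 */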
int ip6_output(struct sk_buff *skb)
{
	if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

#ifdef CONFIG_NETFILTER
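/*
 * Re-do the routing decision for a locally generated packet whose header
 * may have been rewritten by netfilter: look the flow up again and replace
 * skb->dst with the fresh route.
 */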
int ip6_route_me_harder(struct sk_buff *skb)
{
	struct ipv6hdr *iph = skb->nh.ipv6h;
	struct dst_entry *dst;
	struct flowi fl = {
		.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0,
		.nl_u =
		{ .ip6_u =
		  { .daddr = iph->daddr,
		    .saddr = iph->saddr, } },
		.proto = iph->nexthdr,
	};

	dst = ip6_route_output(skb->sk, &fl);

	if (dst->error) {
		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
		LIMIT_NETDEBUG(
			printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n"));
		dst_release(dst);
		return -EINVAL;
	}

	/* Drop old route. */
	dst_release(skb->dst);

	skb->dst = dst;
	return 0;
}
#endif

/*
 *	xmit an sk_buff (used by TCP)
 */

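/*
 * Builds the IPv6 header (and any extension headers from @opt) in front of
 * the transport payload and pushes the packet through NF_IP6_LOCAL_OUT.
 * If the result exceeds the path MTU and @ipfragok is not set, an
 * ICMPV6_PKT_TOOBIG is sent back to ourselves and -EMSGSIZE returned.
 */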
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit;
	u32 mtu;

	if (opt) {
		int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			kfree_skb(skb);
			skb = skb2;
			if (skb == NULL) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				return -ENOBUFS;
			}
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));

	/*
	 *	Fill in the IPv6 header
	 */

	*(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	*(u32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

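/*
 * Deliver a packet carrying a Router Alert option to every raw socket that
 * registered interest in this alert value (via the IPV6_ROUTER_ALERT
 * socket option).  Returns 1 if at least one socket consumed the packet.
 */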
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

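/*
 * Forwarding path for packets not addressed to this host: hand Router
 * Alert packets to interested sockets, enforce the hop limit, consult the
 * forwarding XFRM policy, emit redirects where appropriate, and finally
 * decrement hop_limit and pass the packet through NF_IP6_FORWARD.
 */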
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
						|IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

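/*
 * Copy the per-packet metadata (priority, device, dst reference, traffic
 * control and netfilter state) from the original packet to a freshly
 * created fragment so the fragment is handled exactly like its parent.
 */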
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}

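/*
 * Walk the chain of extension headers to find where a Fragment header must
 * be inserted: after any Hop-by-Hop, Routing and routing-related
 * Destination Options headers.  Returns the byte offset of that point and
 * leaves *nexthdr pointing at the "next header" field to be rewritten.
 */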
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING)
				found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr)
				return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}

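/*
 * Fragment an oversized packet and feed each fragment to @output.  The
 * fast path reuses an existing frag_list (as built by ip6_append_data)
 * when every chunk already has the right size and headroom; otherwise the
 * slow path allocates a fresh skb per fragment and copies the data.
 */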
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb->dst;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	u32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));


		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, tmp_hdr, hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (tmp_hdr)
			kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n"));
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr*)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		err = output(frag);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

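/*
 * Resolve the route for a flow on behalf of a (possibly unconnected)
 * socket: re-use the socket's cached dst when it still matches the
 * destination and outgoing interface, otherwise perform a fresh routing
 * lookup, and fill in the flow's source address if none was given.
 */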
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info*)*dst;

			/* Yes, checking route validity in the unconnected
			   case is not very simple. Take into account that
			   we do not support routing by source, TOS, and
			   MSG_DONTROUTE		--ANK (980726)

			   1. If route was host route, check that
			      cached destination is current.
			      If it is network route, we still may
			      check its validity using saved pointer
			      to the last used address: daddr_cache.
			      We do not want to save whole address now,
			      (because main consumer of this service
			      is tcp, which does not have this problem),
			      so that the last trick works only on connected
			      sockets.
			   2. oif also should be the same.
			 */

			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);

		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}

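/*
 * Append data to the per-socket pending output queue ("corking").  The
 * first call sets up the cork state (options, route, fragment size); later
 * calls keep filling the queued skbs, splitting at maxfraglen so that
 * ip6_push_pending_frames() can turn the queue into a single packet, or a
 * ready-made frag_list for ip6_fragment().  Datagram protocols typically
 * call this from their sendmsg path and follow up with
 * ip6_push_pending_frames() once the message is complete, or with
 * ip6_flush_pending_frames() on error.
 */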
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
		    int offset, int len, int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    int hlimit, struct ipv6_txoptions *opt, struct flowi *fl,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

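/*
 * Turn the socket's pending write queue into a single IPv6 packet: the
 * queued skbs are chained onto the head skb's frag_list, the extension
 * headers and IPv6 header are prepended, and the result is sent through
 * NF_IP6_LOCAL_OUT.  The cork state is released afterwards.
 */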
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));

	*(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}

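/*
 * Abort a corked transmission: drop every skb still sitting on the
 * socket's write queue and release the cork state (options and route).
 */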
void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	if (np->cork.opt) {
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}