blob: 01ef94f7c7f1ce5dfd899703d287be308376a83c [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9 *
10 * Based on linux/net/ipv4/ip_output.c
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Changes:
18 * A.N.Kuznetsov : arithmetic in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
22 * etc.
23 *
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
28 * for datagram xmit
29 */
30
31#include <linux/config.h>
32#include <linux/errno.h>
33#include <linux/types.h>
34#include <linux/string.h>
35#include <linux/socket.h>
36#include <linux/net.h>
37#include <linux/netdevice.h>
38#include <linux/if_arp.h>
39#include <linux/in6.h>
40#include <linux/tcp.h>
41#include <linux/route.h>
42
43#include <linux/netfilter.h>
44#include <linux/netfilter_ipv6.h>
45
46#include <net/sock.h>
47#include <net/snmp.h>
48
49#include <net/ipv6.h>
50#include <net/ndisc.h>
51#include <net/protocol.h>
52#include <net/ip6_route.h>
53#include <net/addrconf.h>
54#include <net/rawv6.h>
55#include <net/icmp.h>
56#include <net/xfrm.h>
57#include <net/checksum.h>
58
59static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62{
63 static u32 ipv6_fragmentation_id = 1;
64 static DEFINE_SPINLOCK(ip6_id_lock);
65
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id);
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1;
70 spin_unlock_bh(&ip6_id_lock);
71}
72
73static inline int ip6_output_finish(struct sk_buff *skb)
74{
75
76 struct dst_entry *dst = skb->dst;
77 struct hh_cache *hh = dst->hh;
78
79 if (hh) {
80 int hh_alen;
81
82 read_lock_bh(&hh->hh_lock);
83 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 read_unlock_bh(&hh->hh_lock);
86 skb_push(skb, hh->hh_len);
87 return hh->hh_output(skb);
88 } else if (dst->neighbour)
89 return dst->neighbour->output(skb);
90
91 IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 kfree_skb(skb);
93 return -EINVAL;
94
95}
96
97/* dev_loopback_xmit for use with netfilter. */
98static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99{
100 newskb->mac.raw = newskb->data;
101 __skb_pull(newskb, newskb->nh.raw - newskb->data);
102 newskb->pkt_type = PACKET_LOOPBACK;
103 newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 BUG_TRAP(newskb->dst);
105
106 netif_rx(newskb);
107 return 0;
108}
109
110
111static int ip6_output2(struct sk_buff *skb)
112{
113 struct dst_entry *dst = skb->dst;
114 struct net_device *dev = dst->dev;
115
116 skb->protocol = htons(ETH_P_IPV6);
117 skb->dev = dev;
118
119 if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 &skb->nh.ipv6h->saddr)) {
125 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127 /* Do not check for IFF_ALLMULTI; multicast routing
128 is not supported in any case.
129 */
130 if (newskb)
131 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 newskb->dev,
133 ip6_dev_loopback_xmit);
134
135 if (skb->nh.ipv6h->hop_limit == 0) {
136 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 kfree_skb(skb);
138 return 0;
139 }
140 }
141
142 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 }
144
145 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
146}
147
148int ip6_output(struct sk_buff *skb)
149{
150 if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151 return ip6_fragment(skb, ip6_output2);
152 else
153 return ip6_output2(skb);
154}
155
Linus Torvalds1da177e2005-04-16 15:20:36 -0700156/*
157 * xmit an sk_buff (used by TCP)
158 */
159
160int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
161 struct ipv6_txoptions *opt, int ipfragok)
162{
163 struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
164 struct in6_addr *first_hop = &fl->fl6_dst;
165 struct dst_entry *dst = skb->dst;
166 struct ipv6hdr *hdr;
167 u8 proto = fl->proto;
168 int seg_len = skb->len;
169 int hlimit;
170 u32 mtu;
171
172 if (opt) {
173 int head_room;
174
175 /* First: exthdrs may take lots of space (~8K for now)
176 MAX_HEADER is not enough.
177 */
178 head_room = opt->opt_nflen + opt->opt_flen;
179 seg_len += head_room;
180 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
181
182 if (skb_headroom(skb) < head_room) {
183 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
184 kfree_skb(skb);
185 skb = skb2;
186 if (skb == NULL) {
187 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
188 return -ENOBUFS;
189 }
190 if (sk)
191 skb_set_owner_w(skb, sk);
192 }
193 if (opt->opt_flen)
194 ipv6_push_frag_opts(skb, opt, &proto);
195 if (opt->opt_nflen)
196 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
197 }
198
199 hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
200
201 /*
202 * Fill in the IPv6 header
203 */
204
205 *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel;
206 hlimit = -1;
207 if (np)
208 hlimit = np->hop_limit;
209 if (hlimit < 0)
210 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211 if (hlimit < 0)
212 hlimit = ipv6_get_hoplimit(dst->dev);
213
214 hdr->payload_len = htons(seg_len);
215 hdr->nexthdr = proto;
216 hdr->hop_limit = hlimit;
217
218 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
219 ipv6_addr_copy(&hdr->daddr, first_hop);
220
221 mtu = dst_mtu(dst);
222 if ((skb->len <= mtu) || ipfragok) {
223 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
Harald Welte6869c4d2005-08-09 19:24:19 -0700224 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
225 dst_output);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700226 }
227
228 if (net_ratelimit())
229 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
230 skb->dev = dst->dev;
231 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
232 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
233 kfree_skb(skb);
234 return -EMSGSIZE;
235}
236
237/*
238 * To avoid extra problems ND packets are send through this
239 * routine. It's code duplication but I really want to avoid
240 * extra checks since ipv6_build_header is used by TCP (which
241 * is for us performance critical)
242 */
243
244int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
245 struct in6_addr *saddr, struct in6_addr *daddr,
246 int proto, int len)
247{
248 struct ipv6_pinfo *np = inet6_sk(sk);
249 struct ipv6hdr *hdr;
250 int totlen;
251
252 skb->protocol = htons(ETH_P_IPV6);
253 skb->dev = dev;
254
255 totlen = len + sizeof(struct ipv6hdr);
256
257 hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
258 skb->nh.ipv6h = hdr;
259
260 *(u32*)hdr = htonl(0x60000000);
261
262 hdr->payload_len = htons(len);
263 hdr->nexthdr = proto;
264 hdr->hop_limit = np->hop_limit;
265
266 ipv6_addr_copy(&hdr->saddr, saddr);
267 ipv6_addr_copy(&hdr->daddr, daddr);
268
269 return 0;
270}
271
272static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
273{
274 struct ip6_ra_chain *ra;
275 struct sock *last = NULL;
276
277 read_lock(&ip6_ra_lock);
278 for (ra = ip6_ra_chain; ra; ra = ra->next) {
279 struct sock *sk = ra->sk;
Andrew McDonald0bd1b592005-08-09 19:44:42 -0700280 if (sk && ra->sel == sel &&
281 (!sk->sk_bound_dev_if ||
282 sk->sk_bound_dev_if == skb->dev->ifindex)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700283 if (last) {
284 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
285 if (skb2)
286 rawv6_rcv(last, skb2);
287 }
288 last = sk;
289 }
290 }
291
292 if (last) {
293 rawv6_rcv(last, skb);
294 read_unlock(&ip6_ra_lock);
295 return 1;
296 }
297 read_unlock(&ip6_ra_lock);
298 return 0;
299}
300
/*
 * Netfilter continuation for forwarded packets: pass @skb to
 * dst_output() and return its result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int rc = dst_output(skb);

	return rc;
}
305
306int ip6_forward(struct sk_buff *skb)
307{
308 struct dst_entry *dst = skb->dst;
309 struct ipv6hdr *hdr = skb->nh.ipv6h;
310 struct inet6_skb_parm *opt = IP6CB(skb);
311
312 if (ipv6_devconf.forwarding == 0)
313 goto error;
314
315 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
316 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
317 goto drop;
318 }
319
320 skb->ip_summed = CHECKSUM_NONE;
321
322 /*
323 * We DO NOT make any processing on
324 * RA packets, pushing them to user level AS IS
325 * without ane WARRANTY that application will be able
326 * to interpret them. The reason is that we
327 * cannot make anything clever here.
328 *
329 * We are not end-node, so that if packet contains
330 * AH/ESP, we cannot make anything.
331 * Defragmentation also would be mistake, RA packets
332 * cannot be fragmented, because there is no warranty
333 * that different fragments will go along one path. --ANK
334 */
335 if (opt->ra) {
336 u8 *ptr = skb->nh.raw + opt->ra;
337 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
338 return 0;
339 }
340
341 /*
342 * check and decrement ttl
343 */
344 if (hdr->hop_limit <= 1) {
345 /* Force OUTPUT device used as source address */
346 skb->dev = dst->dev;
347 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
348 0, skb->dev);
349
350 kfree_skb(skb);
351 return -ETIMEDOUT;
352 }
353
354 if (!xfrm6_route_forward(skb)) {
355 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
356 goto drop;
357 }
358 dst = skb->dst;
359
360 /* IPv6 specs say nothing about it, but it is clear that we cannot
361 send redirects to source routed frames.
362 */
363 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
364 struct in6_addr *target = NULL;
365 struct rt6_info *rt;
366 struct neighbour *n = dst->neighbour;
367
368 /*
369 * incoming and outgoing devices are the same
370 * send a redirect.
371 */
372
373 rt = (struct rt6_info *) dst;
374 if ((rt->rt6i_flags & RTF_GATEWAY))
375 target = (struct in6_addr*)&n->primary_key;
376 else
377 target = &hdr->daddr;
378
379 /* Limit redirects both by destination (here)
380 and by source (inside ndisc_send_redirect)
381 */
382 if (xrlim_allow(dst, 1*HZ))
383 ndisc_send_redirect(skb, n, target);
384 } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
385 |IPV6_ADDR_LINKLOCAL)) {
386 /* This check is security critical. */
387 goto error;
388 }
389
390 if (skb->len > dst_mtu(dst)) {
391 /* Again, force OUTPUT device used as source address */
392 skb->dev = dst->dev;
393 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
394 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
395 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
396 kfree_skb(skb);
397 return -EMSGSIZE;
398 }
399
400 if (skb_cow(skb, dst->dev->hard_header_len)) {
401 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
402 goto drop;
403 }
404
405 hdr = skb->nh.ipv6h;
406
407 /* Mangling hops number delayed to point after skb COW */
408
409 hdr->hop_limit--;
410
411 IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
412 return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
413
414error:
415 IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
416drop:
417 kfree_skb(skb);
418 return -EINVAL;
419}
420
421static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
422{
423 to->pkt_type = from->pkt_type;
424 to->priority = from->priority;
425 to->protocol = from->protocol;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700426 dst_release(to->dst);
427 to->dst = dst_clone(from->dst);
428 to->dev = from->dev;
429
430#ifdef CONFIG_NET_SCHED
431 to->tc_index = from->tc_index;
432#endif
433#ifdef CONFIG_NETFILTER
434 to->nfmark = from->nfmark;
435 /* Connection association is same as pre-frag packet */
436 to->nfct = from->nfct;
437 nf_conntrack_get(to->nfct);
438 to->nfctinfo = from->nfctinfo;
439#ifdef CONFIG_BRIDGE_NETFILTER
440 nf_bridge_put(to->nf_bridge);
441 to->nf_bridge = from->nf_bridge;
442 nf_bridge_get(to->nf_bridge);
443#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700444#endif
445}
446
447int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
448{
449 u16 offset = sizeof(struct ipv6hdr);
450 struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
451 unsigned int packet_len = skb->tail - skb->nh.raw;
452 int found_rhdr = 0;
453 *nexthdr = &skb->nh.ipv6h->nexthdr;
454
455 while (offset + 1 <= packet_len) {
456
457 switch (**nexthdr) {
458
459 case NEXTHDR_HOP:
460 case NEXTHDR_ROUTING:
461 case NEXTHDR_DEST:
462 if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
463 if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
464 offset += ipv6_optlen(exthdr);
465 *nexthdr = &exthdr->nexthdr;
466 exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
467 break;
468 default :
469 return offset;
470 }
471 }
472
473 return offset;
474}
475
476static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
477{
478 struct net_device *dev;
479 struct sk_buff *frag;
480 struct rt6_info *rt = (struct rt6_info*)skb->dst;
481 struct ipv6hdr *tmp_hdr;
482 struct frag_hdr *fh;
483 unsigned int mtu, hlen, left, len;
484 u32 frag_id = 0;
485 int ptr, offset = 0, err=0;
486 u8 *prevhdr, nexthdr = 0;
487
488 dev = rt->u.dst.dev;
489 hlen = ip6_find_1stfragopt(skb, &prevhdr);
490 nexthdr = *prevhdr;
491
492 mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
493
494 if (skb_shinfo(skb)->frag_list) {
495 int first_len = skb_pagelen(skb);
496
497 if (first_len - hlen > mtu ||
498 ((first_len - hlen) & 7) ||
499 skb_cloned(skb))
500 goto slow_path;
501
502 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
503 /* Correct geometry. */
504 if (frag->len > mtu ||
505 ((frag->len & 7) && frag->next) ||
506 skb_headroom(frag) < hlen)
507 goto slow_path;
508
Linus Torvalds1da177e2005-04-16 15:20:36 -0700509 /* Partially cloned skb? */
510 if (skb_shared(frag))
511 goto slow_path;
Herbert Xu2fdba6b2005-05-18 22:52:33 -0700512
513 BUG_ON(frag->sk);
514 if (skb->sk) {
515 sock_hold(skb->sk);
516 frag->sk = skb->sk;
517 frag->destructor = sock_wfree;
518 skb->truesize -= frag->truesize;
519 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700520 }
521
522 err = 0;
523 offset = 0;
524 frag = skb_shinfo(skb)->frag_list;
525 skb_shinfo(skb)->frag_list = NULL;
526 /* BUILD HEADER */
527
528 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
529 if (!tmp_hdr) {
530 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
531 return -ENOMEM;
532 }
533
534 *prevhdr = NEXTHDR_FRAGMENT;
535 memcpy(tmp_hdr, skb->nh.raw, hlen);
536 __skb_pull(skb, hlen);
537 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
538 skb->nh.raw = __skb_push(skb, hlen);
539 memcpy(skb->nh.raw, tmp_hdr, hlen);
540
541 ipv6_select_ident(skb, fh);
542 fh->nexthdr = nexthdr;
543 fh->reserved = 0;
544 fh->frag_off = htons(IP6_MF);
545 frag_id = fh->identification;
546
547 first_len = skb_pagelen(skb);
548 skb->data_len = first_len - skb_headlen(skb);
549 skb->len = first_len;
550 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
551
552
553 for (;;) {
554 /* Prepare header of the next frame,
555 * before previous one went down. */
556 if (frag) {
557 frag->ip_summed = CHECKSUM_NONE;
558 frag->h.raw = frag->data;
559 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
560 frag->nh.raw = __skb_push(frag, hlen);
561 memcpy(frag->nh.raw, tmp_hdr, hlen);
562 offset += skb->len - hlen - sizeof(struct frag_hdr);
563 fh->nexthdr = nexthdr;
564 fh->reserved = 0;
565 fh->frag_off = htons(offset);
566 if (frag->next != NULL)
567 fh->frag_off |= htons(IP6_MF);
568 fh->identification = frag_id;
569 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
570 ip6_copy_metadata(frag, skb);
571 }
572
573 err = output(skb);
574 if (err || !frag)
575 break;
576
577 skb = frag;
578 frag = skb->next;
579 skb->next = NULL;
580 }
581
582 if (tmp_hdr)
583 kfree(tmp_hdr);
584
585 if (err == 0) {
586 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
587 return 0;
588 }
589
590 while (frag) {
591 skb = frag->next;
592 kfree_skb(frag);
593 frag = skb;
594 }
595
596 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
597 return err;
598 }
599
600slow_path:
601 left = skb->len - hlen; /* Space per frame */
602 ptr = hlen; /* Where to start from */
603
604 /*
605 * Fragment the datagram.
606 */
607
608 *prevhdr = NEXTHDR_FRAGMENT;
609
610 /*
611 * Keep copying data until we run out.
612 */
613 while(left > 0) {
614 len = left;
615 /* IF: it doesn't fit, use 'mtu' - the data space left */
616 if (len > mtu)
617 len = mtu;
618 /* IF: we are not sending upto and including the packet end
619 then align the next start on an eight byte boundary */
620 if (len < left) {
621 len &= ~7;
622 }
623 /*
624 * Allocate buffer.
625 */
626
627 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
Patrick McHardy64ce2072005-08-09 20:50:53 -0700628 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
630 err = -ENOMEM;
631 goto fail;
632 }
633
634 /*
635 * Set up data on packet
636 */
637
638 ip6_copy_metadata(frag, skb);
639 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
640 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
641 frag->nh.raw = frag->data;
642 fh = (struct frag_hdr*)(frag->data + hlen);
643 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
644
645 /*
646 * Charge the memory for the fragment to any owner
647 * it might possess
648 */
649 if (skb->sk)
650 skb_set_owner_w(frag, skb->sk);
651
652 /*
653 * Copy the packet header into the new buffer.
654 */
655 memcpy(frag->nh.raw, skb->data, hlen);
656
657 /*
658 * Build fragment header.
659 */
660 fh->nexthdr = nexthdr;
661 fh->reserved = 0;
662 if (frag_id) {
663 ipv6_select_ident(skb, fh);
664 frag_id = fh->identification;
665 } else
666 fh->identification = frag_id;
667
668 /*
669 * Copy a block of the IP datagram.
670 */
671 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
672 BUG();
673 left -= len;
674
675 fh->frag_off = htons(offset);
676 if (left > 0)
677 fh->frag_off |= htons(IP6_MF);
678 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
679
680 ptr += len;
681 offset += len;
682
683 /*
684 * Put this fragment into the sending queue.
685 */
686
687 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
688
689 err = output(frag);
690 if (err)
691 goto fail;
692 }
693 kfree_skb(skb);
694 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
695 return err;
696
697fail:
698 kfree_skb(skb);
699 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
700 return err;
701}
702
703int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
704{
705 int err = 0;
706
707 *dst = NULL;
708 if (sk) {
709 struct ipv6_pinfo *np = inet6_sk(sk);
710
711 *dst = sk_dst_check(sk, np->dst_cookie);
712 if (*dst) {
713 struct rt6_info *rt = (struct rt6_info*)*dst;
714
715 /* Yes, checking route validity in not connected
716 case is not very simple. Take into account,
717 that we do not support routing by source, TOS,
718 and MSG_DONTROUTE --ANK (980726)
719
720 1. If route was host route, check that
721 cached destination is current.
722 If it is network route, we still may
723 check its validity using saved pointer
724 to the last used address: daddr_cache.
725 We do not want to save whole address now,
726 (because main consumer of this service
727 is tcp, which has not this problem),
728 so that the last trick works only on connected
729 sockets.
730 2. oif also should be the same.
731 */
732
733 if (((rt->rt6i_dst.plen != 128 ||
734 !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
735 && (np->daddr_cache == NULL ||
736 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
737 || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
738 dst_release(*dst);
739 *dst = NULL;
740 }
741 }
742 }
743
744 if (*dst == NULL)
745 *dst = ip6_route_output(sk, fl);
746
747 if ((err = (*dst)->error))
748 goto out_err_release;
749
750 if (ipv6_addr_any(&fl->fl6_src)) {
751 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
752
Olaf Hering44456d32005-07-27 11:45:17 -0700753 if (err)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700754 goto out_err_release;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700755 }
756
757 return 0;
758
759out_err_release:
760 dst_release(*dst);
761 *dst = NULL;
762 return err;
763}
764
765int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb),
766 void *from, int length, int transhdrlen,
767 int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt,
768 unsigned int flags)
769{
770 struct inet_sock *inet = inet_sk(sk);
771 struct ipv6_pinfo *np = inet6_sk(sk);
772 struct sk_buff *skb;
773 unsigned int maxfraglen, fragheaderlen;
774 int exthdrlen;
775 int hh_len;
776 int mtu;
777 int copy;
778 int err;
779 int offset = 0;
780 int csummode = CHECKSUM_NONE;
781
782 if (flags&MSG_PROBE)
783 return 0;
784 if (skb_queue_empty(&sk->sk_write_queue)) {
785 /*
786 * setup for corking
787 */
788 if (opt) {
789 if (np->cork.opt == NULL) {
790 np->cork.opt = kmalloc(opt->tot_len,
791 sk->sk_allocation);
792 if (unlikely(np->cork.opt == NULL))
793 return -ENOBUFS;
794 } else if (np->cork.opt->tot_len < opt->tot_len) {
795 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
796 return -EINVAL;
797 }
798 memcpy(np->cork.opt, opt, opt->tot_len);
799 inet->cork.flags |= IPCORK_OPT;
800 /* need source address above miyazawa*/
801 }
802 dst_hold(&rt->u.dst);
803 np->cork.rt = rt;
804 inet->cork.fl = *fl;
805 np->cork.hop_limit = hlimit;
806 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
807 if (dst_allfrag(rt->u.dst.path))
808 inet->cork.flags |= IPCORK_ALLFRAG;
809 inet->cork.length = 0;
810 sk->sk_sndmsg_page = NULL;
811 sk->sk_sndmsg_off = 0;
812 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
813 length += exthdrlen;
814 transhdrlen += exthdrlen;
815 } else {
816 rt = np->cork.rt;
817 fl = &inet->cork.fl;
818 if (inet->cork.flags & IPCORK_OPT)
819 opt = np->cork.opt;
820 transhdrlen = 0;
821 exthdrlen = 0;
822 mtu = inet->cork.fragsize;
823 }
824
825 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
826
827 fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
828 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
829
830 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
831 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
832 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
833 return -EMSGSIZE;
834 }
835 }
836
837 /*
838 * Let's try using as much space as possible.
839 * Use MTU if total length of the message fits into the MTU.
840 * Otherwise, we need to reserve fragment header and
841 * fragment alignment (= 8-15 octects, in total).
842 *
843 * Note that we may need to "move" the data from the tail of
844 * of the buffer to the new fragment when we split
845 * the message.
846 *
847 * FIXME: It may be fragmented into multiple chunks
848 * at once if non-fragmentable extension headers
849 * are too large.
850 * --yoshfuji
851 */
852
853 inet->cork.length += length;
854
855 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
856 goto alloc_new_skb;
857
858 while (length > 0) {
859 /* Check if the remaining data fits into current packet. */
860 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
861 if (copy < length)
862 copy = maxfraglen - skb->len;
863
864 if (copy <= 0) {
865 char *data;
866 unsigned int datalen;
867 unsigned int fraglen;
868 unsigned int fraggap;
869 unsigned int alloclen;
870 struct sk_buff *skb_prev;
871alloc_new_skb:
872 skb_prev = skb;
873
874 /* There's no room in the current skb */
875 if (skb_prev)
876 fraggap = skb_prev->len - maxfraglen;
877 else
878 fraggap = 0;
879
880 /*
881 * If remaining data exceeds the mtu,
882 * we know we need more fragment(s).
883 */
884 datalen = length + fraggap;
885 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
886 datalen = maxfraglen - fragheaderlen;
887
888 fraglen = datalen + fragheaderlen;
889 if ((flags & MSG_MORE) &&
890 !(rt->u.dst.dev->features&NETIF_F_SG))
891 alloclen = mtu;
892 else
893 alloclen = datalen + fragheaderlen;
894
895 /*
896 * The last fragment gets additional space at tail.
897 * Note: we overallocate on fragments with MSG_MODE
898 * because we have no idea if we're the last one.
899 */
900 if (datalen == length + fraggap)
901 alloclen += rt->u.dst.trailer_len;
902
903 /*
904 * We just reserve space for fragment header.
905 * Note: this may be overallocation if the message
906 * (without MSG_MORE) fits into the MTU.
907 */
908 alloclen += sizeof(struct frag_hdr);
909
910 if (transhdrlen) {
911 skb = sock_alloc_send_skb(sk,
912 alloclen + hh_len,
913 (flags & MSG_DONTWAIT), &err);
914 } else {
915 skb = NULL;
916 if (atomic_read(&sk->sk_wmem_alloc) <=
917 2 * sk->sk_sndbuf)
918 skb = sock_wmalloc(sk,
919 alloclen + hh_len, 1,
920 sk->sk_allocation);
921 if (unlikely(skb == NULL))
922 err = -ENOBUFS;
923 }
924 if (skb == NULL)
925 goto error;
926 /*
927 * Fill in the control structures
928 */
929 skb->ip_summed = csummode;
930 skb->csum = 0;
931 /* reserve for fragmentation */
932 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
933
934 /*
935 * Find where to start putting bytes
936 */
937 data = skb_put(skb, fraglen);
938 skb->nh.raw = data + exthdrlen;
939 data += fragheaderlen;
940 skb->h.raw = data + exthdrlen;
941
942 if (fraggap) {
943 skb->csum = skb_copy_and_csum_bits(
944 skb_prev, maxfraglen,
945 data + transhdrlen, fraggap, 0);
946 skb_prev->csum = csum_sub(skb_prev->csum,
947 skb->csum);
948 data += fraggap;
949 skb_trim(skb_prev, maxfraglen);
950 }
951 copy = datalen - transhdrlen - fraggap;
952 if (copy < 0) {
953 err = -EINVAL;
954 kfree_skb(skb);
955 goto error;
956 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
957 err = -EFAULT;
958 kfree_skb(skb);
959 goto error;
960 }
961
962 offset += copy;
963 length -= datalen - fraggap;
964 transhdrlen = 0;
965 exthdrlen = 0;
966 csummode = CHECKSUM_NONE;
967
968 /*
969 * Put the packet on the pending queue
970 */
971 __skb_queue_tail(&sk->sk_write_queue, skb);
972 continue;
973 }
974
975 if (copy > length)
976 copy = length;
977
978 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
979 unsigned int off;
980
981 off = skb->len;
982 if (getfrag(from, skb_put(skb, copy),
983 offset, copy, off, skb) < 0) {
984 __skb_trim(skb, off);
985 err = -EFAULT;
986 goto error;
987 }
988 } else {
989 int i = skb_shinfo(skb)->nr_frags;
990 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
991 struct page *page = sk->sk_sndmsg_page;
992 int off = sk->sk_sndmsg_off;
993 unsigned int left;
994
995 if (page && (left = PAGE_SIZE - off) > 0) {
996 if (copy >= left)
997 copy = left;
998 if (page != frag->page) {
999 if (i == MAX_SKB_FRAGS) {
1000 err = -EMSGSIZE;
1001 goto error;
1002 }
1003 get_page(page);
1004 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1005 frag = &skb_shinfo(skb)->frags[i];
1006 }
1007 } else if(i < MAX_SKB_FRAGS) {
1008 if (copy > PAGE_SIZE)
1009 copy = PAGE_SIZE;
1010 page = alloc_pages(sk->sk_allocation, 0);
1011 if (page == NULL) {
1012 err = -ENOMEM;
1013 goto error;
1014 }
1015 sk->sk_sndmsg_page = page;
1016 sk->sk_sndmsg_off = 0;
1017
1018 skb_fill_page_desc(skb, i, page, 0, 0);
1019 frag = &skb_shinfo(skb)->frags[i];
1020 skb->truesize += PAGE_SIZE;
1021 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1022 } else {
1023 err = -EMSGSIZE;
1024 goto error;
1025 }
1026 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1027 err = -EFAULT;
1028 goto error;
1029 }
1030 sk->sk_sndmsg_off += copy;
1031 frag->size += copy;
1032 skb->len += copy;
1033 skb->data_len += copy;
1034 }
1035 offset += copy;
1036 length -= copy;
1037 }
1038 return 0;
1039error:
1040 inet->cork.length -= length;
1041 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1042 return err;
1043}
1044
1045int ip6_push_pending_frames(struct sock *sk)
1046{
1047 struct sk_buff *skb, *tmp_skb;
1048 struct sk_buff **tail_skb;
1049 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1050 struct inet_sock *inet = inet_sk(sk);
1051 struct ipv6_pinfo *np = inet6_sk(sk);
1052 struct ipv6hdr *hdr;
1053 struct ipv6_txoptions *opt = np->cork.opt;
1054 struct rt6_info *rt = np->cork.rt;
1055 struct flowi *fl = &inet->cork.fl;
1056 unsigned char proto = fl->proto;
1057 int err = 0;
1058
1059 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1060 goto out;
1061 tail_skb = &(skb_shinfo(skb)->frag_list);
1062
1063 /* move skb->data to ip header from ext header */
1064 if (skb->data < skb->nh.raw)
1065 __skb_pull(skb, skb->nh.raw - skb->data);
1066 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1067 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1068 *tail_skb = tmp_skb;
1069 tail_skb = &(tmp_skb->next);
1070 skb->len += tmp_skb->len;
1071 skb->data_len += tmp_skb->len;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001072 skb->truesize += tmp_skb->truesize;
1073 __sock_put(tmp_skb->sk);
1074 tmp_skb->destructor = NULL;
1075 tmp_skb->sk = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001076 }
1077
1078 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1079 __skb_pull(skb, skb->h.raw - skb->nh.raw);
1080 if (opt && opt->opt_flen)
1081 ipv6_push_frag_opts(skb, opt, &proto);
1082 if (opt && opt->opt_nflen)
1083 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1084
1085 skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1086
1087 *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000);
1088
1089 if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1090 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1091 else
1092 hdr->payload_len = 0;
1093 hdr->hop_limit = np->cork.hop_limit;
1094 hdr->nexthdr = proto;
1095 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1096 ipv6_addr_copy(&hdr->daddr, final_dst);
1097
1098 skb->dst = dst_clone(&rt->u.dst);
1099 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1100 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1101 if (err) {
1102 if (err > 0)
Herbert Xu3320da82005-04-19 22:32:22 -07001103 err = np->recverr ? net_xmit_errno(err) : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001104 if (err)
1105 goto error;
1106 }
1107
1108out:
1109 inet->cork.flags &= ~IPCORK_OPT;
1110 if (np->cork.opt) {
1111 kfree(np->cork.opt);
1112 np->cork.opt = NULL;
1113 }
1114 if (np->cork.rt) {
1115 dst_release(&np->cork.rt->u.dst);
1116 np->cork.rt = NULL;
1117 inet->cork.flags &= ~IPCORK_ALLFRAG;
1118 }
1119 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1120 return err;
1121error:
1122 goto out;
1123}
1124
1125void ip6_flush_pending_frames(struct sock *sk)
1126{
1127 struct inet_sock *inet = inet_sk(sk);
1128 struct ipv6_pinfo *np = inet6_sk(sk);
1129 struct sk_buff *skb;
1130
1131 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1132 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1133 kfree_skb(skb);
1134 }
1135
1136 inet->cork.flags &= ~IPCORK_OPT;
1137
1138 if (np->cork.opt) {
1139 kfree(np->cork.opt);
1140 np->cork.opt = NULL;
1141 }
1142 if (np->cork.rt) {
1143 dst_release(&np->cork.rt->u.dst);
1144 np->cork.rt = NULL;
1145 inet->cork.flags &= ~IPCORK_ALLFRAG;
1146 }
1147 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1148}