1/*
2 * IPv6 output functions
3 * Linux INET6 implementation
4 *
5 * Authors:
6 * Pedro Roque <roque@di.fc.ul.pt>
7 *
8 * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9 *
10 * Based on linux/net/ipv4/ip_output.c
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 *
17 * Changes:
18 * A.N.Kuznetsov : arithmetic in fragmentation.
19 * extension headers are implemented.
20 * route changes now work.
21 * ip6_forward does not confuse sniffers.
22 * etc.
23 *
24 * H. von Brand : Added missing #include <linux/string.h>
25 * Imran Patel : frag id should be in NBO
26 * Kazunori MIYAZAWA @USAGI
27 * : add ip6_append_data and related functions
28 * for datagram xmit
29 */
30
31#include <linux/config.h>
32#include <linux/errno.h>
33#include <linux/types.h>
34#include <linux/string.h>
35#include <linux/socket.h>
36#include <linux/net.h>
37#include <linux/netdevice.h>
38#include <linux/if_arp.h>
39#include <linux/in6.h>
40#include <linux/tcp.h>
41#include <linux/route.h>
42
43#include <linux/netfilter.h>
44#include <linux/netfilter_ipv6.h>
45
46#include <net/sock.h>
47#include <net/snmp.h>
48
49#include <net/ipv6.h>
50#include <net/ndisc.h>
51#include <net/protocol.h>
52#include <net/ip6_route.h>
53#include <net/addrconf.h>
54#include <net/rawv6.h>
55#include <net/icmp.h>
56#include <net/xfrm.h>
57#include <net/checksum.h>
58
59static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
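/*
 * Pick the next IPv6 fragment identification value, serialised by
 * ip6_id_lock, and store it in the Fragment header (zero is skipped).
 */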
61static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62{
63 static u32 ipv6_fragmentation_id = 1;
64 static DEFINE_SPINLOCK(ip6_id_lock);
65
66 spin_lock_bh(&ip6_id_lock);
67 fhdr->identification = htonl(ipv6_fragmentation_id);
68 if (++ipv6_fragmentation_id == 0)
69 ipv6_fragmentation_id = 1;
70 spin_unlock_bh(&ip6_id_lock);
71}
72
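/*
 * Resolve the link-layer header, either from the cached hh entry or via
 * the neighbour's output function, and transmit the packet.
 */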
73static inline int ip6_output_finish(struct sk_buff *skb)
74{
75
76 struct dst_entry *dst = skb->dst;
77 struct hh_cache *hh = dst->hh;
78
79 if (hh) {
80 int hh_alen;
81
82 read_lock_bh(&hh->hh_lock);
83 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85 read_unlock_bh(&hh->hh_lock);
86 skb_push(skb, hh->hh_len);
87 return hh->hh_output(skb);
88 } else if (dst->neighbour)
89 return dst->neighbour->output(skb);
90
91 IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92 kfree_skb(skb);
93 return -EINVAL;
94
95}
96
97/* dev_loopback_xmit for use with netfilter. */
98static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99{
100 newskb->mac.raw = newskb->data;
101 __skb_pull(newskb, newskb->nh.raw - newskb->data);
102 newskb->pkt_type = PACKET_LOOPBACK;
103 newskb->ip_summed = CHECKSUM_UNNECESSARY;
104 BUG_TRAP(newskb->dst);
105
106 netif_rx(newskb);
107 return 0;
108}
109
110
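/*
 * Device output: loop a copy of multicast packets back to local
 * listeners when required, then pass the packet through the
 * NF_IP6_POST_ROUTING hook to ip6_output_finish.
 */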
111static int ip6_output2(struct sk_buff *skb)
112{
113 struct dst_entry *dst = skb->dst;
114 struct net_device *dev = dst->dev;
115
116 skb->protocol = htons(ETH_P_IPV6);
117 skb->dev = dev;
118
119 if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123 ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124 &skb->nh.ipv6h->saddr)) {
125 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127 /* Do not check for IFF_ALLMULTI; multicast routing
128 is not supported in any case.
129 */
130 if (newskb)
131 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132 newskb->dev,
133 ip6_dev_loopback_xmit);
134
135 if (skb->nh.ipv6h->hop_limit == 0) {
136 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137 kfree_skb(skb);
138 return 0;
139 }
140 }
141
142 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143 }
144
145 return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev, ip6_output_finish);
146}
147
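/*
 * Fragment when the packet is larger than the route MTU or dst_allfrag()
 * is set; otherwise hand it straight to ip6_output2().
 */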
148int ip6_output(struct sk_buff *skb)
149{
150 if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst))
151 return ip6_fragment(skb, ip6_output2);
152 else
153 return ip6_output2(skb);
154}
155
156/*
157 * xmit an sk_buff (used by TCP)
158 */
159
160int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
161 struct ipv6_txoptions *opt, int ipfragok)
162{
163 struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
164 struct in6_addr *first_hop = &fl->fl6_dst;
165 struct dst_entry *dst = skb->dst;
166 struct ipv6hdr *hdr;
167 u8 proto = fl->proto;
168 int seg_len = skb->len;
169 int hlimit, tclass;
170 u32 mtu;
171
172 if (opt) {
173 int head_room;
174
175 /* First: exthdrs may take lots of space (~8K for now);
176 MAX_HEADER is not enough.
177 */
178 head_room = opt->opt_nflen + opt->opt_flen;
179 seg_len += head_room;
180 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
181
182 if (skb_headroom(skb) < head_room) {
183 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
184 kfree_skb(skb);
185 skb = skb2;
186 if (skb == NULL) {
187 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
188 return -ENOBUFS;
189 }
190 if (sk)
191 skb_set_owner_w(skb, sk);
192 }
193 if (opt->opt_flen)
194 ipv6_push_frag_opts(skb, opt, &proto);
195 if (opt->opt_nflen)
196 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
197 }
198
199 hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
200
201 /*
202 * Fill in the IPv6 header
203 */
204
205 hlimit = -1;
206 if (np)
207 hlimit = np->hop_limit;
208 if (hlimit < 0)
209 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
210 if (hlimit < 0)
211 hlimit = ipv6_get_hoplimit(dst->dev);
212
213 tclass = -1;
214 if (np)
215 tclass = np->tclass;
216 if (tclass < 0)
217 tclass = 0;
218
219 *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
220
221 hdr->payload_len = htons(seg_len);
222 hdr->nexthdr = proto;
223 hdr->hop_limit = hlimit;
224
225 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
226 ipv6_addr_copy(&hdr->daddr, first_hop);
227
228 mtu = dst_mtu(dst);
229 if ((skb->len <= mtu) || ipfragok) {
230 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
231 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
232 dst_output);
233 }
234
235 if (net_ratelimit())
236 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
237 skb->dev = dst->dev;
238 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
239 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
240 kfree_skb(skb);
241 return -EMSGSIZE;
242}
243
244/*
245 * To avoid extra problems ND packets are sent through this
246 * routine. It's code duplication, but I really want to avoid
247 * extra checks since ipv6_build_header is used by TCP (which
248 * is performance critical for us)
249 */
250
251int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
252 struct in6_addr *saddr, struct in6_addr *daddr,
253 int proto, int len)
254{
255 struct ipv6_pinfo *np = inet6_sk(sk);
256 struct ipv6hdr *hdr;
257 int totlen;
258
259 skb->protocol = htons(ETH_P_IPV6);
260 skb->dev = dev;
261
262 totlen = len + sizeof(struct ipv6hdr);
263
264 hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
265 skb->nh.ipv6h = hdr;
266
267 *(u32*)hdr = htonl(0x60000000);
268
269 hdr->payload_len = htons(len);
270 hdr->nexthdr = proto;
271 hdr->hop_limit = np->hop_limit;
272
273 ipv6_addr_copy(&hdr->saddr, saddr);
274 ipv6_addr_copy(&hdr->daddr, daddr);
275
276 return 0;
277}
278
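/*
 * Deliver a copy of a Router Alert packet to every raw socket on the
 * ip6_ra_chain whose selector matches; returns 1 if the packet was
 * consumed by at least one socket.
 */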
279static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
280{
281 struct ip6_ra_chain *ra;
282 struct sock *last = NULL;
283
284 read_lock(&ip6_ra_lock);
285 for (ra = ip6_ra_chain; ra; ra = ra->next) {
286 struct sock *sk = ra->sk;
287 if (sk && ra->sel == sel &&
288 (!sk->sk_bound_dev_if ||
289 sk->sk_bound_dev_if == skb->dev->ifindex)) {
290 if (last) {
291 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
292 if (skb2)
293 rawv6_rcv(last, skb2);
294 }
295 last = sk;
296 }
297 }
298
299 if (last) {
300 rawv6_rcv(last, skb);
301 read_unlock(&ip6_ra_lock);
302 return 1;
303 }
304 read_unlock(&ip6_ra_lock);
305 return 0;
306}
307
308static inline int ip6_forward_finish(struct sk_buff *skb)
309{
310 return dst_output(skb);
311}
312
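/*
 * Forward a packet on behalf of another node: run the policy and
 * hop-limit checks, possibly emit redirects or ICMP errors, then
 * re-transmit through the NF_IP6_FORWARD hook.
 */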
313int ip6_forward(struct sk_buff *skb)
314{
315 struct dst_entry *dst = skb->dst;
316 struct ipv6hdr *hdr = skb->nh.ipv6h;
317 struct inet6_skb_parm *opt = IP6CB(skb);
318
319 if (ipv6_devconf.forwarding == 0)
320 goto error;
321
322 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
323 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
324 goto drop;
325 }
326
327 skb->ip_summed = CHECKSUM_NONE;
328
329 /*
330 * We DO NOT do any processing on
331 * RA packets, pushing them to user level AS IS
332 * without any warranty that the application will be able
333 * to interpret them. The reason is that we
334 * cannot do anything clever here.
335 *
336 * We are not an end node, so if a packet contains
337 * AH/ESP we cannot do anything.
338 * Defragmentation would also be a mistake; RA packets
339 * cannot be fragmented, because there is no guarantee
340 * that different fragments will go along one path. --ANK
341 */
342 if (opt->ra) {
343 u8 *ptr = skb->nh.raw + opt->ra;
344 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
345 return 0;
346 }
347
348 /*
349 * check and decrement the hop limit
350 */
351 if (hdr->hop_limit <= 1) {
352 /* Force OUTPUT device used as source address */
353 skb->dev = dst->dev;
354 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
355 0, skb->dev);
356
357 kfree_skb(skb);
358 return -ETIMEDOUT;
359 }
360
361 if (!xfrm6_route_forward(skb)) {
362 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
363 goto drop;
364 }
365 dst = skb->dst;
366
367 /* IPv6 specs say nothing about it, but it is clear that we cannot
368 send redirects to source routed frames.
369 */
370 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
371 struct in6_addr *target = NULL;
372 struct rt6_info *rt;
373 struct neighbour *n = dst->neighbour;
374
375 /*
376 * The incoming and outgoing devices are the same;
377 * send a redirect.
378 */
379
380 rt = (struct rt6_info *) dst;
381 if ((rt->rt6i_flags & RTF_GATEWAY))
382 target = (struct in6_addr*)&n->primary_key;
383 else
384 target = &hdr->daddr;
385
386 /* Limit redirects both by destination (here)
387 and by source (inside ndisc_send_redirect)
388 */
389 if (xrlim_allow(dst, 1*HZ))
390 ndisc_send_redirect(skb, n, target);
391 } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
392 |IPV6_ADDR_LINKLOCAL)) {
393 /* This check is security critical. */
394 goto error;
395 }
396
397 if (skb->len > dst_mtu(dst)) {
398 /* Again, force OUTPUT device used as source address */
399 skb->dev = dst->dev;
400 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
401 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
402 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
403 kfree_skb(skb);
404 return -EMSGSIZE;
405 }
406
407 if (skb_cow(skb, dst->dev->hard_header_len)) {
408 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
409 goto drop;
410 }
411
412 hdr = skb->nh.ipv6h;
413
414 /* Mangling the hop limit is delayed until after the skb COW */
415
416 hdr->hop_limit--;
417
418 IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
419 return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
420
421error:
422 IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
423drop:
424 kfree_skb(skb);
425 return -EINVAL;
426}
427
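/* Copy per-packet metadata from the original skb to a fragment. */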
428static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
429{
430 to->pkt_type = from->pkt_type;
431 to->priority = from->priority;
432 to->protocol = from->protocol;
433 dst_release(to->dst);
434 to->dst = dst_clone(from->dst);
435 to->dev = from->dev;
436
437#ifdef CONFIG_NET_SCHED
438 to->tc_index = from->tc_index;
439#endif
440#ifdef CONFIG_NETFILTER
441 to->nfmark = from->nfmark;
442 /* Connection association is same as pre-frag packet */
443 to->nfct = from->nfct;
444 nf_conntrack_get(to->nfct);
445 to->nfctinfo = from->nfctinfo;
446#ifdef CONFIG_BRIDGE_NETFILTER
447 nf_bridge_put(to->nf_bridge);
448 to->nf_bridge = from->nf_bridge;
449 nf_bridge_get(to->nf_bridge);
450#endif
451#endif
452}
453
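/*
 * Return the offset at which a Fragment header should be inserted,
 * skipping the extension headers that must precede it; *nexthdr is left
 * pointing at the nexthdr field to be patched.
 */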
454int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
455{
456 u16 offset = sizeof(struct ipv6hdr);
457 struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
458 unsigned int packet_len = skb->tail - skb->nh.raw;
459 int found_rhdr = 0;
460 *nexthdr = &skb->nh.ipv6h->nexthdr;
461
462 while (offset + 1 <= packet_len) {
463
464 switch (**nexthdr) {
465
466 case NEXTHDR_HOP:
467 case NEXTHDR_ROUTING:
468 case NEXTHDR_DEST:
469 if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
470 if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
471 offset += ipv6_optlen(exthdr);
472 *nexthdr = &exthdr->nexthdr;
473 exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
474 break;
475 default:
476 return offset;
477 }
478 }
479
480 return offset;
481}
482
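/*
 * Fragment an oversized packet: the fast path re-uses an existing
 * frag_list, the slow path copies the payload into freshly allocated
 * skbs of at most mtu bytes each.
 */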
483static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
484{
485 struct net_device *dev;
486 struct sk_buff *frag;
487 struct rt6_info *rt = (struct rt6_info*)skb->dst;
488 struct ipv6hdr *tmp_hdr;
489 struct frag_hdr *fh;
490 unsigned int mtu, hlen, left, len;
491 u32 frag_id = 0;
492 int ptr, offset = 0, err=0;
493 u8 *prevhdr, nexthdr = 0;
494
495 dev = rt->u.dst.dev;
496 hlen = ip6_find_1stfragopt(skb, &prevhdr);
497 nexthdr = *prevhdr;
498
499 mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
500
501 if (skb_shinfo(skb)->frag_list) {
502 int first_len = skb_pagelen(skb);
503
504 if (first_len - hlen > mtu ||
505 ((first_len - hlen) & 7) ||
506 skb_cloned(skb))
507 goto slow_path;
508
509 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
510 /* Correct geometry. */
511 if (frag->len > mtu ||
512 ((frag->len & 7) && frag->next) ||
513 skb_headroom(frag) < hlen)
514 goto slow_path;
515
516 /* Partially cloned skb? */
517 if (skb_shared(frag))
518 goto slow_path;
519
520 BUG_ON(frag->sk);
521 if (skb->sk) {
522 sock_hold(skb->sk);
523 frag->sk = skb->sk;
524 frag->destructor = sock_wfree;
525 skb->truesize -= frag->truesize;
526 }
527 }
528
529 err = 0;
530 offset = 0;
531 frag = skb_shinfo(skb)->frag_list;
532 skb_shinfo(skb)->frag_list = NULL;
533 /* BUILD HEADER */
534
535 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
536 if (!tmp_hdr) {
537 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
538 return -ENOMEM;
539 }
540
541 *prevhdr = NEXTHDR_FRAGMENT;
542 memcpy(tmp_hdr, skb->nh.raw, hlen);
543 __skb_pull(skb, hlen);
544 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
545 skb->nh.raw = __skb_push(skb, hlen);
546 memcpy(skb->nh.raw, tmp_hdr, hlen);
547
548 ipv6_select_ident(skb, fh);
549 fh->nexthdr = nexthdr;
550 fh->reserved = 0;
551 fh->frag_off = htons(IP6_MF);
552 frag_id = fh->identification;
553
554 first_len = skb_pagelen(skb);
555 skb->data_len = first_len - skb_headlen(skb);
556 skb->len = first_len;
557 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
558
559
560 for (;;) {
561 /* Prepare the header of the next frame
562 * before the previous one is sent. */
563 if (frag) {
564 frag->ip_summed = CHECKSUM_NONE;
565 frag->h.raw = frag->data;
566 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
567 frag->nh.raw = __skb_push(frag, hlen);
568 memcpy(frag->nh.raw, tmp_hdr, hlen);
569 offset += skb->len - hlen - sizeof(struct frag_hdr);
570 fh->nexthdr = nexthdr;
571 fh->reserved = 0;
572 fh->frag_off = htons(offset);
573 if (frag->next != NULL)
574 fh->frag_off |= htons(IP6_MF);
575 fh->identification = frag_id;
576 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
577 ip6_copy_metadata(frag, skb);
578 }
579
580 err = output(skb);
581 if (err || !frag)
582 break;
583
584 skb = frag;
585 frag = skb->next;
586 skb->next = NULL;
587 }
588
589 if (tmp_hdr)
590 kfree(tmp_hdr);
591
592 if (err == 0) {
593 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
594 return 0;
595 }
596
597 while (frag) {
598 skb = frag->next;
599 kfree_skb(frag);
600 frag = skb;
601 }
602
603 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
604 return err;
605 }
606
607slow_path:
608 left = skb->len - hlen; /* Space per frame */
609 ptr = hlen; /* Where to start from */
610
611 /*
612 * Fragment the datagram.
613 */
614
615 *prevhdr = NEXTHDR_FRAGMENT;
616
617 /*
618 * Keep copying data until we run out.
619 */
620 while(left > 0) {
621 len = left;
622 /* IF: it doesn't fit, use 'mtu' - the data space left */
623 if (len > mtu)
624 len = mtu;
625 /* IF: we are not sending up to and including the packet end
626 then align the next start on an eight byte boundary */
627 if (len < left) {
628 len &= ~7;
629 }
630 /*
631 * Allocate buffer.
632 */
633
634 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
635 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
636 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
637 err = -ENOMEM;
638 goto fail;
639 }
640
641 /*
642 * Set up data on packet
643 */
644
645 ip6_copy_metadata(frag, skb);
646 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
647 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
648 frag->nh.raw = frag->data;
649 fh = (struct frag_hdr*)(frag->data + hlen);
650 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
651
652 /*
653 * Charge the memory for the fragment to any owner
654 * it might possess
655 */
656 if (skb->sk)
657 skb_set_owner_w(frag, skb->sk);
658
659 /*
660 * Copy the packet header into the new buffer.
661 */
662 memcpy(frag->nh.raw, skb->data, hlen);
663
664 /*
665 * Build fragment header.
666 */
667 fh->nexthdr = nexthdr;
668 fh->reserved = 0;
669 if (frag_id) {
670 ipv6_select_ident(skb, fh);
671 frag_id = fh->identification;
672 } else
673 fh->identification = frag_id;
674
675 /*
676 * Copy a block of the IP datagram.
677 */
678 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
679 BUG();
680 left -= len;
681
682 fh->frag_off = htons(offset);
683 if (left > 0)
684 fh->frag_off |= htons(IP6_MF);
685 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
686
687 ptr += len;
688 offset += len;
689
690 /*
691 * Put this fragment into the sending queue.
692 */
693
694 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
695
696 err = output(frag);
697 if (err)
698 goto fail;
699 }
700 kfree_skb(skb);
701 IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
702 return err;
703
704fail:
705 kfree_skb(skb);
706 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
707 return err;
708}
709
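/*
 * Look up (or revalidate a cached) route for the given flow and fill in
 * the flow's source address if the caller left it unspecified.
 */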
710int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
711{
712 int err = 0;
713
714 *dst = NULL;
715 if (sk) {
716 struct ipv6_pinfo *np = inet6_sk(sk);
717
718 *dst = sk_dst_check(sk, np->dst_cookie);
719 if (*dst) {
720 struct rt6_info *rt = (struct rt6_info*)*dst;
721
722 /* Yes, checking route validity in the not-connected
723 case is not very simple. Take into account
724 that we do not support routing by source, TOS,
725 and MSG_DONTROUTE --ANK (980726)
726
727 1. If the route was a host route, check that the
728 cached destination is current.
729 If it is a network route, we still may
730 check its validity using the saved pointer
731 to the last used address: daddr_cache.
732 We do not want to save the whole address now,
733 (because the main consumer of this service
734 is tcp, which does not have this problem),
735 so the last trick works only on connected
736 sockets.
737 2. oif also should be the same.
738 */
739
740 if (((rt->rt6i_dst.plen != 128 ||
741 !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
742 && (np->daddr_cache == NULL ||
743 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
744 || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
745 dst_release(*dst);
746 *dst = NULL;
747 }
748 }
749 }
750
751 if (*dst == NULL)
752 *dst = ip6_route_output(sk, fl);
753
754 if ((err = (*dst)->error))
755 goto out_err_release;
756
757 if (ipv6_addr_any(&fl->fl6_src)) {
758 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
759
760 if (err)
761 goto out_err_release;
762 }
763
764 return 0;
765
766out_err_release:
767 dst_release(*dst);
768 *dst = NULL;
769 return err;
770}
771
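/*
 * Append data to the socket's pending (corked) queue, building
 * MTU-sized buffers that ip6_push_pending_frames() will send later.
 */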
772int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
773 int offset, int len, int odd, struct sk_buff *skb),
774 void *from, int length, int transhdrlen,
775 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
776 struct rt6_info *rt, unsigned int flags)
777{
778 struct inet_sock *inet = inet_sk(sk);
779 struct ipv6_pinfo *np = inet6_sk(sk);
780 struct sk_buff *skb;
781 unsigned int maxfraglen, fragheaderlen;
782 int exthdrlen;
783 int hh_len;
784 int mtu;
785 int copy;
786 int err;
787 int offset = 0;
788 int csummode = CHECKSUM_NONE;
789
790 if (flags&MSG_PROBE)
791 return 0;
792 if (skb_queue_empty(&sk->sk_write_queue)) {
793 /*
794 * setup for corking
795 */
796 if (opt) {
797 if (np->cork.opt == NULL) {
798 np->cork.opt = kmalloc(opt->tot_len,
799 sk->sk_allocation);
800 if (unlikely(np->cork.opt == NULL))
801 return -ENOBUFS;
802 } else if (np->cork.opt->tot_len < opt->tot_len) {
803 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
804 return -EINVAL;
805 }
806 memcpy(np->cork.opt, opt, opt->tot_len);
807 inet->cork.flags |= IPCORK_OPT;
808 /* need source address above --miyazawa */
809 }
810 dst_hold(&rt->u.dst);
811 np->cork.rt = rt;
812 inet->cork.fl = *fl;
813 np->cork.hop_limit = hlimit;
814 np->cork.tclass = tclass;
815 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
816 if (dst_allfrag(rt->u.dst.path))
817 inet->cork.flags |= IPCORK_ALLFRAG;
818 inet->cork.length = 0;
819 sk->sk_sndmsg_page = NULL;
820 sk->sk_sndmsg_off = 0;
821 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
822 length += exthdrlen;
823 transhdrlen += exthdrlen;
824 } else {
825 rt = np->cork.rt;
826 fl = &inet->cork.fl;
827 if (inet->cork.flags & IPCORK_OPT)
828 opt = np->cork.opt;
829 transhdrlen = 0;
830 exthdrlen = 0;
831 mtu = inet->cork.fragsize;
832 }
833
834 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
835
836 fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
837 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
838
839 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
840 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
841 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
842 return -EMSGSIZE;
843 }
844 }
845
846 /*
847 * Let's try using as much space as possible.
848 * Use MTU if total length of the message fits into the MTU.
849 * Otherwise, we need to reserve fragment header and
850 * fragment alignment (= 8-15 octets, in total).
851 *
852 * Note that we may need to "move" the data from the tail
853 * of the buffer to the new fragment when we split
854 * the message.
855 *
856 * FIXME: It may be fragmented into multiple chunks
857 * at once if non-fragmentable extension headers
858 * are too large.
859 * --yoshfuji
860 */
861
862 inet->cork.length += length;
863
864 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
865 goto alloc_new_skb;
866
867 while (length > 0) {
868 /* Check if the remaining data fits into current packet. */
869 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
870 if (copy < length)
871 copy = maxfraglen - skb->len;
872
873 if (copy <= 0) {
874 char *data;
875 unsigned int datalen;
876 unsigned int fraglen;
877 unsigned int fraggap;
878 unsigned int alloclen;
879 struct sk_buff *skb_prev;
880alloc_new_skb:
881 skb_prev = skb;
882
883 /* There's no room in the current skb */
884 if (skb_prev)
885 fraggap = skb_prev->len - maxfraglen;
886 else
887 fraggap = 0;
888
889 /*
890 * If remaining data exceeds the mtu,
891 * we know we need more fragment(s).
892 */
893 datalen = length + fraggap;
894 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
895 datalen = maxfraglen - fragheaderlen;
896
897 fraglen = datalen + fragheaderlen;
898 if ((flags & MSG_MORE) &&
899 !(rt->u.dst.dev->features&NETIF_F_SG))
900 alloclen = mtu;
901 else
902 alloclen = datalen + fragheaderlen;
903
904 /*
905 * The last fragment gets additional space at tail.
906 * Note: we overallocate on fragments with MSG_MODE
907 * because we have no idea if we're the last one.
908 */
909 if (datalen == length + fraggap)
910 alloclen += rt->u.dst.trailer_len;
911
912 /*
913 * We just reserve space for the fragment header.
914 * Note: this may be an overallocation if the message
915 * (without MSG_MORE) fits into the MTU.
916 */
917 alloclen += sizeof(struct frag_hdr);
918
919 if (transhdrlen) {
920 skb = sock_alloc_send_skb(sk,
921 alloclen + hh_len,
922 (flags & MSG_DONTWAIT), &err);
923 } else {
924 skb = NULL;
925 if (atomic_read(&sk->sk_wmem_alloc) <=
926 2 * sk->sk_sndbuf)
927 skb = sock_wmalloc(sk,
928 alloclen + hh_len, 1,
929 sk->sk_allocation);
930 if (unlikely(skb == NULL))
931 err = -ENOBUFS;
932 }
933 if (skb == NULL)
934 goto error;
935 /*
936 * Fill in the control structures
937 */
938 skb->ip_summed = csummode;
939 skb->csum = 0;
940 /* reserve for fragmentation */
941 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
942
943 /*
944 * Find where to start putting bytes
945 */
946 data = skb_put(skb, fraglen);
947 skb->nh.raw = data + exthdrlen;
948 data += fragheaderlen;
949 skb->h.raw = data + exthdrlen;
950
951 if (fraggap) {
952 skb->csum = skb_copy_and_csum_bits(
953 skb_prev, maxfraglen,
954 data + transhdrlen, fraggap, 0);
955 skb_prev->csum = csum_sub(skb_prev->csum,
956 skb->csum);
957 data += fraggap;
958 skb_trim(skb_prev, maxfraglen);
959 }
960 copy = datalen - transhdrlen - fraggap;
961 if (copy < 0) {
962 err = -EINVAL;
963 kfree_skb(skb);
964 goto error;
965 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
966 err = -EFAULT;
967 kfree_skb(skb);
968 goto error;
969 }
970
971 offset += copy;
972 length -= datalen - fraggap;
973 transhdrlen = 0;
974 exthdrlen = 0;
975 csummode = CHECKSUM_NONE;
976
977 /*
978 * Put the packet on the pending queue
979 */
980 __skb_queue_tail(&sk->sk_write_queue, skb);
981 continue;
982 }
983
984 if (copy > length)
985 copy = length;
986
987 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
988 unsigned int off;
989
990 off = skb->len;
991 if (getfrag(from, skb_put(skb, copy),
992 offset, copy, off, skb) < 0) {
993 __skb_trim(skb, off);
994 err = -EFAULT;
995 goto error;
996 }
997 } else {
998 int i = skb_shinfo(skb)->nr_frags;
999 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1000 struct page *page = sk->sk_sndmsg_page;
1001 int off = sk->sk_sndmsg_off;
1002 unsigned int left;
1003
1004 if (page && (left = PAGE_SIZE - off) > 0) {
1005 if (copy >= left)
1006 copy = left;
1007 if (page != frag->page) {
1008 if (i == MAX_SKB_FRAGS) {
1009 err = -EMSGSIZE;
1010 goto error;
1011 }
1012 get_page(page);
1013 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1014 frag = &skb_shinfo(skb)->frags[i];
1015 }
1016 } else if (i < MAX_SKB_FRAGS) {
1017 if (copy > PAGE_SIZE)
1018 copy = PAGE_SIZE;
1019 page = alloc_pages(sk->sk_allocation, 0);
1020 if (page == NULL) {
1021 err = -ENOMEM;
1022 goto error;
1023 }
1024 sk->sk_sndmsg_page = page;
1025 sk->sk_sndmsg_off = 0;
1026
1027 skb_fill_page_desc(skb, i, page, 0, 0);
1028 frag = &skb_shinfo(skb)->frags[i];
1029 skb->truesize += PAGE_SIZE;
1030 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1031 } else {
1032 err = -EMSGSIZE;
1033 goto error;
1034 }
1035 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1036 err = -EFAULT;
1037 goto error;
1038 }
1039 sk->sk_sndmsg_off += copy;
1040 frag->size += copy;
1041 skb->len += copy;
1042 skb->data_len += copy;
1043 }
1044 offset += copy;
1045 length -= copy;
1046 }
1047 return 0;
1048error:
1049 inet->cork.length -= length;
1050 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1051 return err;
1052}
1053
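/*
 * Glue the queued buffers into one packet, prepend the IPv6 header and
 * hand the result to the NF_IP6_LOCAL_OUT hook / dst_output().
 */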
1054int ip6_push_pending_frames(struct sock *sk)
1055{
1056 struct sk_buff *skb, *tmp_skb;
1057 struct sk_buff **tail_skb;
1058 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1059 struct inet_sock *inet = inet_sk(sk);
1060 struct ipv6_pinfo *np = inet6_sk(sk);
1061 struct ipv6hdr *hdr;
1062 struct ipv6_txoptions *opt = np->cork.opt;
1063 struct rt6_info *rt = np->cork.rt;
1064 struct flowi *fl = &inet->cork.fl;
1065 unsigned char proto = fl->proto;
1066 int err = 0;
1067
1068 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1069 goto out;
1070 tail_skb = &(skb_shinfo(skb)->frag_list);
1071
1072 /* move skb->data to ip header from ext header */
1073 if (skb->data < skb->nh.raw)
1074 __skb_pull(skb, skb->nh.raw - skb->data);
1075 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1076 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1077 *tail_skb = tmp_skb;
1078 tail_skb = &(tmp_skb->next);
1079 skb->len += tmp_skb->len;
1080 skb->data_len += tmp_skb->len;
1081 skb->truesize += tmp_skb->truesize;
1082 __sock_put(tmp_skb->sk);
1083 tmp_skb->destructor = NULL;
1084 tmp_skb->sk = NULL;
1085 }
1086
1087 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1088 __skb_pull(skb, skb->h.raw - skb->nh.raw);
1089 if (opt && opt->opt_flen)
1090 ipv6_push_frag_opts(skb, opt, &proto);
1091 if (opt && opt->opt_nflen)
1092 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1093
1094 skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1095
1096 *(u32*)hdr = fl->fl6_flowlabel |
1097 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1098
1099 if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1100 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1101 else
1102 hdr->payload_len = 0;
1103 hdr->hop_limit = np->cork.hop_limit;
1104 hdr->nexthdr = proto;
1105 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1106 ipv6_addr_copy(&hdr->daddr, final_dst);
1107
1108 skb->dst = dst_clone(&rt->u.dst);
1109 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
1110 err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1111 if (err) {
1112 if (err > 0)
1113 err = np->recverr ? net_xmit_errno(err) : 0;
1114 if (err)
1115 goto error;
1116 }
1117
1118out:
1119 inet->cork.flags &= ~IPCORK_OPT;
1120 if (np->cork.opt) {
1121 kfree(np->cork.opt);
1122 np->cork.opt = NULL;
1123 }
1124 if (np->cork.rt) {
1125 dst_release(&np->cork.rt->u.dst);
1126 np->cork.rt = NULL;
1127 inet->cork.flags &= ~IPCORK_ALLFRAG;
1128 }
1129 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1130 return err;
1131error:
1132 goto out;
1133}
1134
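/* Discard any pending (corked) data and reset the cork state. */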
1135void ip6_flush_pending_frames(struct sock *sk)
1136{
1137 struct inet_sock *inet = inet_sk(sk);
1138 struct ipv6_pinfo *np = inet6_sk(sk);
1139 struct sk_buff *skb;
1140
1141 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1142 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1143 kfree_skb(skb);
1144 }
1145
1146 inet->cork.flags &= ~IPCORK_OPT;
1147
1148 if (np->cork.opt) {
1149 kfree(np->cork.opt);
1150 np->cork.opt = NULL;
1151 }
1152 if (np->cork.rt) {
1153 dst_release(&np->cork.rt->u.dst);
1154 np->cork.rt = NULL;
1155 inet->cork.flags &= ~IPCORK_ALLFRAG;
1156 }
1157 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1158}