/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

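/*
 * Note on fragment IDs: a single global counter, serialized by a
 * spinlock, hands out IDs 1..0xffffffff and skips 0 on wrap-around.
 * Simple, but the sequence is shared by all flows; it is not a
 * per-destination counter.
 */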
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

static inline int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}


static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
					&skb->nh.ipv6h->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip6_dev_loopback_xmit);

			if (skb->nh.ipv6h->hop_limit == 0) {
				IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

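/*
 * ip6_output: hand the packet to the post-routing path.  Fragment first
 * if it exceeds the path MTU and cannot be offloaded (no UFO), or if the
 * path demands fragmentation on every packet (dst_allfrag).
 */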
int ip6_output(struct sk_buff *skb)
{
	if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
156
Linus Torvalds1da177e2005-04-16 15:20:36 -0700157/*
158 * xmit an sk_buff (used by TCP)
159 */
160
161int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162 struct ipv6_txoptions *opt, int ipfragok)
163{
164 struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165 struct in6_addr *first_hop = &fl->fl6_dst;
166 struct dst_entry *dst = skb->dst;
167 struct ipv6hdr *hdr;
168 u8 proto = fl->proto;
169 int seg_len = skb->len;
YOSHIFUJI Hideaki41a1f8e2005-09-08 10:19:03 +0900170 int hlimit, tclass;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700171 u32 mtu;
172
173 if (opt) {
174 int head_room;
175
176 /* First: exthdrs may take lots of space (~8K for now)
177 MAX_HEADER is not enough.
178 */
179 head_room = opt->opt_nflen + opt->opt_flen;
180 seg_len += head_room;
181 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182
183 if (skb_headroom(skb) < head_room) {
184 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185 kfree_skb(skb);
186 skb = skb2;
187 if (skb == NULL) {
188 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189 return -ENOBUFS;
190 }
191 if (sk)
192 skb_set_owner_w(skb, sk);
193 }
194 if (opt->opt_flen)
195 ipv6_push_frag_opts(skb, opt, &proto);
196 if (opt->opt_nflen)
197 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198 }
199
200 hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201
202 /*
203 * Fill in the IPv6 header
204 */
205
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 hlimit = -1;
207 if (np)
208 hlimit = np->hop_limit;
209 if (hlimit < 0)
210 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211 if (hlimit < 0)
212 hlimit = ipv6_get_hoplimit(dst->dev);
213
YOSHIFUJI Hideaki41a1f8e2005-09-08 10:19:03 +0900214 tclass = -1;
215 if (np)
216 tclass = np->tclass;
217 if (tclass < 0)
218 tclass = 0;
219
220 *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
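	/*
	 * The assignment above packs the first 32 bits of the IPv6
	 * header: version (4 bits, always 6, hence 0x60000000), traffic
	 * class (8 bits, shifted into bits 20-27) and the 20-bit flow
	 * label, which fl6_flowlabel already carries in network byte
	 * order.
	 */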

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok) {
		IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
	skb->nh.ipv6h = hdr;

	*(u32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

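/*
 * Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain for the given selector.  The usual fan-out pattern:
 * every matching socket except the last gets a clone, and the original
 * skb goes to the last match, saving one copy in the common
 * single-listener case.
 */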
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = skb->nh.ipv6h;
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->ip_summed = CHECKSUM_NONE;

	/*
	 *	We do not process RA packets; we push them to user level
	 *	as is, without any warranty that applications will be
	 *	able to interpret them.  The reason is that we cannot
	 *	make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb->nh.raw + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else if (ipv6_addr_type(&hdr->saddr) & (IPV6_ADDR_MULTICAST |
						  IPV6_ADDR_LOOPBACK |
						  IPV6_ADDR_LINKLOCAL)) {
		/* This check is security critical. */
		goto error;
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = skb->nh.ipv6h;

	/* Mangling the hop count is delayed until after the skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_IP6_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	nf_conntrack_put_reasm(to->nfct_reasm);
	to->nfct_reasm = from->nfct_reasm;
	nf_conntrack_get_reasm(to->nfct_reasm);
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}

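/*
 * Find the offset at which a Fragment header has to be inserted: just
 * past the unfragmentable part of the packet (RFC 2460), i.e. past any
 * Hop-by-Hop, Destination Options and Routing headers.  A Destination
 * Options header that follows a Routing header already belongs to the
 * fragmentable part, so the walk stops in front of it.  *nexthdr is
 * left pointing at the nexthdr byte that will precede the Fragment
 * header.
 */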
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)(skb->nh.ipv6h + 1);
	unsigned int packet_len = skb->tail - skb->nh.raw;
	int found_rhdr = 0;
	*nexthdr = &skb->nh.ipv6h->nexthdr;

	while (offset + 1 <= packet_len) {
		switch (**nexthdr) {
		case NEXTHDR_HOP:
		case NEXTHDR_ROUTING:
		case NEXTHDR_DEST:
			if (**nexthdr == NEXTHDR_ROUTING)
				found_rhdr = 1;
			if (**nexthdr == NEXTHDR_DEST && found_rhdr)
				return offset;
			offset += ipv6_optlen(exthdr);
			*nexthdr = &exthdr->nexthdr;
			exthdr = (struct ipv6_opt_hdr *)(skb->nh.raw + offset);
			break;
		default:
			return offset;
		}
	}

	return offset;
}

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	u32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr);
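	/*
	 * From here on 'mtu' is the payload budget of one fragment: the
	 * link MTU minus the unfragmentable part (hlen) and the 8-byte
	 * Fragment header.  Every fragment except the last must carry a
	 * multiple of 8 bytes, since fragment offsets are expressed in
	 * 8-byte units.
	 */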

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		*prevhdr = NEXTHDR_FRAGMENT;
		memcpy(tmp_hdr, skb->nh.raw, hlen);
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		skb->nh.raw = __skb_push(skb, hlen);
		memcpy(skb->nh.raw, tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before the previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, tmp_hdr, hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_RESERVED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		frag->nh.raw = frag->data;
		fh = (struct frag_hdr *)(frag->data + hlen);
		frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		memcpy(frag->nh.raw, skb->data, hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, frag->h.raw, len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		err = output(frag);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}
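
/*
 * Illustrative arithmetic (not from the original source): with a 1500
 * byte link MTU and no extension headers, hlen = 40, so each fragment
 * may carry 1500 - 40 - 8 = 1452 bytes, rounded down to 1448 for
 * 8-byte alignment.  A 3000 byte packet (2960 payload bytes) thus
 * leaves as fragments of 1448, 1448 and 64 payload bytes at offsets
 * 0, 1448 and 2896.
 */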

int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	int err = 0;

	*dst = NULL;
	if (sk) {
		struct ipv6_pinfo *np = inet6_sk(sk);

		*dst = sk_dst_check(sk, np->dst_cookie);
		if (*dst) {
			struct rt6_info *rt = (struct rt6_info *)*dst;

			/* Yes, checking route validity in the unconnected
			   case is not very simple. Take into account that
			   we do not support routing by source, TOS, and
			   MSG_DONTROUTE --ANK (980726)

			   1. If route was host route, check that
			      cached destination is current.
			      If it is network route, we still may
			      check its validity using saved pointer
			      to the last used address: daddr_cache.
			      We do not want to save whole address now,
			      (because main consumer of this service
			      is tcp, which does not have this problem),
			      so that the last trick works only on connected
			      sockets.
			   2. oif also should be the same.
			 */
			if (((rt->rt6i_dst.plen != 128 ||
			      !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
			     && (np->daddr_cache == NULL ||
				 !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
			    || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
				dst_release(*dst);
				*dst = NULL;
			}
		}
	}

	if (*dst == NULL)
		*dst = ip6_route_output(sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;
	return err;
}

EXPORT_SYMBOL_GPL(ip6_dst_lookup);

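/*
 * UDP Fragmentation Offload helper: instead of fragmenting a large UDP
 * datagram in software, build one oversized skb, push the payload into
 * page frags, and record the per-fragment size in ufo_size together
 * with a preselected fragment ID, so that a UFO-capable device can
 * emit the fragments itself.
 */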
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb->nh.raw = skb->data;

		/* initialize protocol header pointer */
		skb->h.raw = skb->data + fragheaderlen;

		skb->ip_summed = CHECKSUM_HW;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) -
					    sizeof(struct frag_hdr);
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above. --miyazawa */
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

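	/*
	 * fragheaderlen is everything that precedes the transport
	 * payload in each fragment except the Fragment header itself;
	 * maxfraglen is the largest packet length that may sit in one
	 * queued skb so that, once the 8-byte Fragment header is
	 * inserted, the fragment still fits the MTU and all but the
	 * last fragment carry a multiple of 8 payload bytes.
	 */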
	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		if (ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					fragheaderlen, transhdrlen, mtu, flags))
			goto error;

		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

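/*
 * ip6_push_pending_frames: take everything queued on sk_write_queue by
 * ip6_append_data, chain the trailing skbs onto the head skb's
 * frag_list, prepend the corked options and the IPv6 header, and hand
 * the finished packet to netfilter and dst_output.
 */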
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = np->cork.rt;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb->h.raw - skb->nh.raw);
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb->nh.ipv6h = hdr = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));

	*(u32 *)hdr = fl->fl6_flowlabel |
		      htonl(0x60000000 | ((int)np->cork.tclass << 20));

	if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
		hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
	else
		hdr->payload_len = 0;
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
	err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	inet->cork.flags &= ~IPCORK_OPT;

	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (np->cork.rt) {
		dst_release(&np->cork.rt->u.dst);
		np->cork.rt = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}