/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

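/*
 * Illustrative sketch, not part of the original file: any code that
 * modifies a field of an already built IP header must refresh iph->check,
 * and ip_send_check() recomputes it from scratch.  The helper below is a
 * hypothetical example, not kernel API (the real forwarding path uses an
 * incremental checksum update instead).
 */
#if 0
static void example_rewrite_ttl(struct iphdr *iph, u8 new_ttl)
{
	iph->ttl = new_ttl;	/* mutate a field covered by the checksum... */
	ip_send_check(iph);	/* ...then rebuild iph->check over the header */
}
#endif
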
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	newskb->mac.raw = newskb->data;
	__skb_pull(newskb, newskb->nh.raw - newskb->data);
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);
	netif_rx(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	if (opt)
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
	else
		iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	iph->tot_len  = htons(skb->len);
	ip_select_ident(iph, &rt->u.dst, sk);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	/* Send it out. */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

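/*
 * Illustrative sketch, not part of the original file: a caller that has
 * already attached a route to the skb (skb->dst) and built its transport
 * header can hand the packet off as below; TCP's SYN-ACK path is the
 * in-tree user of this export.  The request-sock field names here are
 * assumptions for the example and may not match your tree.
 */
#if 0
static int example_send_synack(struct sock *sk, struct sk_buff *skb,
			       struct inet_request_sock *ireq)
{
	/* skb->dst must already point at the route for this flow. */
	return ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
				     ireq->rmt_addr, ireq->opt);
}
#endif
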
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct hh_cache *hh = dst->hh;
	struct net_device *dev = dst->dev;
	int hh_len = LL_RESERVED_SPACE(dev);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (hh) {
		int hh_alen;

		read_lock_bh(&hh->hh_lock);
		hh_alen = HH_DATA_ALIGN(hh->hh_len);
		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		read_unlock_bh(&hh->hh_lock);
		skb_push(skb, hh->hh_len);
		return hh->hh_output(skb);
	} else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_finish_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
		       ip_finish_output2);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable *)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users.
	 */

	if (rt->rt_flags & RTCF_MULTICAST) {
		if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    && ((rt->rt_flags & RTCF_LOCAL) || !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
					newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host. */

		if (skb->nh.iph->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags & RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	if (skb->len > dst_mtu(&rt->u.dst))
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

int ip_output(struct sk_buff *skb)
{
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	if (skb->len > dst_mtu(skb->dst) &&
	    !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
		return ip_fragment(skb, ip_finish_output);
	else
		return ip_finish_output(skb);
}

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rt = (struct rtable *)skb->dst;
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		u32 daddr;

		/* Use the correct destination address if we have options. */
		daddr = inet->daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .uli_u = { .ports =
						       { .sport = inet->sport,
							 .dport = inet->dport } } };

			/* If this fails, the retransmit mechanism of the
			 * transport layer will keep trying until the route
			 * appears or the connection times itself out.
			 */
			if (ip_route_output_flow(&rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build the IP header. */
	iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	*((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	iph->tot_len = htons(skb->len);
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	skb->nh.iph   = iph;
	/* The transport layer has set skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->u.dst, sk,
			     (skb_shinfo(skb)->tso_segs ?: 1) - 1);

	/* Add an IP checksum. */
	ip_send_check(iph);

	skb->priority = sk->sk_priority;

	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       dst_output);

no_route:
	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}


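/*
 * Illustrative sketch, not part of the original file: a connected
 * transport socket builds its own header, then lets ip_queue_xmit()
 * route the packet and prepend the IP header.  The helper name is an
 * assumption; the skb is assumed to have been allocated against sk so
 * that it is already charged to the socket.
 */
#if 0
static int example_queue_segment(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = (struct tcphdr *)skb_push(skb, sizeof(*th));

	skb->h.th = th;
	/* ... fill in the transport header here ... */
	return ip_queue_xmit(skb, 0);	/* ipfragok = 0: honour DF/PMTU */
}
#endif
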
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
	to->nfmark = from->nfmark;
	/* Connection association is same as pre-frag packet */
	nf_conntrack_put(to->nfct);
	to->nfct = from->nfct;
	nf_conntrack_get(to->nfct);
	to->nfctinfo = from->nfctinfo;
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	nf_bridge_put(to->nf_bridge);
	to->nf_bridge = from->nf_bridge;
	nf_bridge_get(to->nf_bridge);
#endif
#endif
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each carrying a copy of the IP header plus a
 *	block of the original datagram's data) so that each piece fits in a
 *	single device frame, and queue such frames for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	int not_last_frag;
	struct rtable *rt = (struct rtable *)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = skb->nh.iph;

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->u.dst)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */

	/* When frag_list is given, use it.  First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited.  In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				skb->truesize -= frag->truesize;
			}
		}

		/* Everything is OK.  Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				frag->h.raw = frag->data;
				frag->nh.raw = __skb_push(frag, hlen);
				memcpy(frag->nh.raw, iph, hlen);
				iph = frag->nh.iph;
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset >> 3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete the checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
	mtu -= nf_bridge_pad(skb);
#else
	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb2->nh.raw = skb2->data;
		skb2->h.raw = skb2->data + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		memcpy(skb2->nh.raw, skb->data, hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = skb2->nh.iph;
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC: If we are fragmenting a fragment that's not the
		 *	last fragment then keep the MF bit set on each fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */

		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;
	}
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

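/*
 * Worked example, not part of the original file, of the slow-path
 * arithmetic above: a 4000-byte datagram (20-byte header, 3980 data
 * bytes) over a 1500-byte MTU leaves mtu = 1480 bytes of data space per
 * fragment.  Every fragment but the last carries a multiple of 8 bytes:
 *
 *	frag 1: 1480 data bytes, frag_off =   0 (   0 >> 3), IP_MF set
 *	frag 2: 1480 data bytes, frag_off = 185 (1480 >> 3), IP_MF set
 *	frag 3: 1020 data bytes, frag_off = 370 (2960 >> 3), IP_MF clear
 */
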
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_HW) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		unsigned int csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}

static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	unsigned int csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by the network
	 * device, so create one single skb packet containing the complete
	 * udp datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb->nh.raw = skb->data;

		/* initialize the protocol header pointer */
		skb->h.raw = skb->data + fragheaderlen;

		skb->ip_summed = CHECKSUM_HW;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UFO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);
	return err;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP: other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable *rt,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options) + opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		dst_hold(&rt->u.dst);
		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
		inet->cork.rt = rt;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = inet->cork.rt;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) &&
	    !exthdrlen)
		csummode = CHECKSUM_HW;

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		if (ip_ufo_append_data(sk, getfrag, from, length, hh_len,
				       fragheaderlen, transhdrlen, mtu, flags))
			goto error;

		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of skbs;
	 * each segment is an IP fragment ready for sending to the network
	 * once the appropriate IP header has been added.
	 */

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at the tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb->nh.raw = data + exthdrlen;
			data += fragheaderlen;
			skb->h.raw = data + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				skb_trim(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
				skb->truesize += PAGE_SIZE;
				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

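/*
 * Worked example, not part of the original file, of the maxfraglen
 * formula above: with mtu = 1500 and no IP options, fragheaderlen = 20
 * and maxfraglen = ((1500 - 20) & ~7) + 20 = 1500, i.e. full frames.
 * With 4 bytes of options, fragheaderlen = 24 and maxfraglen =
 * ((1500 - 24) & ~7) + 24 = 1496: the data part of each non-final
 * fragment is rounded down to a multiple of 8, as IP requires.
 */
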
ssize_t ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = inet->cork.rt;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->u.dst.dev->features & NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO))
		skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen);

	while (size > 0) {
		int i;

		if (skb_shinfo(skb)->ufo_size)
			len = size;
		else {
			/* Check if the remaining data fits into the current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			char *data;
			struct iphdr *iph;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fragheaderlen + fraggap);
			skb->nh.iph = iph = (struct iphdr *)data;
			data += fragheaderlen;
			skb->h.raw = data;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				skb_trim(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i - 1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			unsigned int csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = inet->cork.rt;
	struct iphdr *iph;
	int df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the ip header from the ext header */
	if (skb->data < skb->nh.raw)
		__skb_pull(skb, skb->nh.raw - skb->data);
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
	 * allow the frame generated here to be fragmented.  No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc != IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc == IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->tot_len = htons(skb->len);
	iph->frag_off = df;
	if (!df) {
		__ip_select_ident(iph, &rt->u.dst, 0);
	} else {
		iph->id = htons(inet->id++);
	}
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;
	ip_send_check(iph);

	skb->priority = sk->sk_priority;
	skb->dst = dst_clone(&rt->u.dst);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
		      skb->dst->dev, dst_output);
	if (err) {
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
	return err;

error:
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

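/*
 * Illustrative sketch, not part of the original file: the two-step model
 * a datagram protocol follows - queue pieces with ip_append_data(), then
 * flush them as one datagram with ip_push_pending_frames().  "rt" is
 * assumed to be a route the caller already looked up, transhdrlen is left
 * 0 (a real transport passes its header size), and error handling is
 * trimmed; an assumption-laden outline, not the real UDP send path.
 */
#if 0
static int example_corked_send(struct sock *sk, struct msghdr *msg,
			       size_t len, struct ipcm_cookie *ipc,
			       struct rtable *rt)
{
	int err;

	lock_sock(sk);
	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
			     0 /* transhdrlen */, ipc, rt, msg->msg_flags);
	if (err)
		ip_flush_pending_frames(sk);		/* drop what was queued */
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip_push_pending_frames(sk);	/* send the datagram */
	release_sock(sk);
	return err;
}
#endif
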
/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
}

/*
 *	Fetch data from kernel space and fill in the checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	unsigned int csum;

	csum = csum_partial_copy_nocheck(dptr + offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	Used to send TCP resets so far.  ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 *
 *	LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options opt;
		char data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	u32 daddr;
	struct rtable *rt = (struct rtable *)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(skb->nh.iph->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = skb->h.th->dest,
						 .dport = skb->h.th->source } },
				    .proto = sk->sk_protocol };
		if (ip_route_output_key(&rt, &fl))
			return;
	}

	/* And let IP do all the hard work.
	 *
	 * This chunk is not reenterable, hence the spinlock.  Note that it
	 * relies on the fact that this function is called with BHs locally
	 * disabled and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = skb->nh.iph->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = skb->nh.iph->protocol;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

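/*
 * Illustrative sketch, not part of the original file: how a caller such
 * as TCP's reset path fills in struct ip_reply_arg before calling
 * ip_send_reply().  The reply-header setup is modelled on the TCP user
 * but trimmed; treat the details as assumptions.
 */
#if 0
static void example_send_reset(struct sock *ctl_sk, struct sk_buff *skb)
{
	struct tcphdr rep;		/* reply header, built by the caller */
	struct ip_reply_arg arg;

	memset(&rep, 0, sizeof(rep));
	memset(&arg, 0, sizeof(arg));
	/* ... fill in rep: ports, sequence numbers, RST bit ... */

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep);
	arg.csum = csum_partial((char *)&rep, sizeof(rep), 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(ctl_sk, skb, &arg, sizeof(rep));
}
#endif
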
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);