/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case the packet is not accepted by
 *                                      output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen      :       Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after year coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

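/*
 * For reference, ip_fast_csum() computes the standard RFC 1071 Internet
 * checksum over the header. A minimal, portable sketch of the same
 * computation (an illustration, not the optimized kernel code; assumes
 * the check field has already been zeroed, as above):
 *
 *      static u16 rfc1071_csum(const u16 *hdr, unsigned int ihl)
 *      {
 *              u32 sum = 0;
 *              unsigned int i;
 *
 *              for (i = 0; i < ihl * 2; i++)   // ihl counts 32-bit words
 *                      sum += hdr[i];          // sum the 16-bit words
 *              while (sum >> 16)               // fold carries back in
 *                      sum = (sum & 0xffff) + (sum >> 16);
 *              return ~sum;                    // one's complement
 *      }
 *
 * The per-architecture ip_fast_csum() implementations are heavily
 * optimized equivalents of this loop.
 */
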
int __ip_local_out(struct sk_buff *skb)
{
        struct iphdr *iph = ip_hdr(skb);

        iph->tot_len = htons(skb->len);
        ip_send_check(iph);
        return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));
        netif_rx_ni(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = ip4_dst_hoplimit(dst);
        return ttl;
}

/*
 *              Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph;

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        iph->version = 4;
        iph->ihl = 5;
        iph->tos = inet->tos;
        if (ip_dont_fragment(sk, &rt->dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl = ip_select_ttl(inet, &rt->dst);
        iph->daddr = rt->rt_dst;
        iph->saddr = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        ip_select_ident(iph, &rt->dst, sk);

        if (opt && opt->opt.optlen) {
                iph->ihl += opt->opt.optlen >> 2;
                ip_options_build(skb, &opt->opt, daddr, rt, 0);
        }

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        /* Send it out. */
        return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

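/*
 * As a reference for the header construction above, the fixed 20-byte
 * IPv4 header (RFC 791) that ip_build_and_send_pkt() fills in is:
 *
 *       0                   1                   2                   3
 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *      +-------+-------+---------------+-------------------------------+
 *      |Version|  IHL  |      TOS      |         Total Length          |
 *      +-------+-------+---------------+-----+-------------------------+
 *      |        Identification         |Flags|    Fragment Offset      |
 *      +---------------+---------------+-----+-------------------------+
 *      |      TTL      |   Protocol    |       Header Checksum         |
 *      +---------------+---------------+-------------------------------+
 *      |                        Source Address                         |
 *      +---------------------------------------------------------------+
 *      |                     Destination Address                       |
 *      +---------------------------------------------------------------+
 *
 * ihl = 5 means no options; the option-handling branch above bumps ihl
 * by optlen >> 2, since options are always padded to a multiple of 4
 * bytes.
 */
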
static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);

        if (rt->rt_type == RTN_MULTICAST) {
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
        } else if (rt->rt_type == RTN_BROADCAST)
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}

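/*
 * Note on the skb_is_gso() test above: a GSO skb is deliberately allowed
 * to exceed the route MTU here, because it is segmented into MTU-sized
 * packets later (by the device or by software GSO), so IP-level
 * fragmentation of the aggregate would be both wrong and wasteful.
 */
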
int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = rt->dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that came back to us after forwarding; they will be
                   dropped by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    &&
                    ((rt->rt_flags & RTCF_LOCAL) ||
                     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
                   ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
                                NULL, newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
                            skb->dev, ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;

        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

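/*
 * NF_HOOK_COND only traverses the POST_ROUTING netfilter chain when its
 * final argument is true; for skbs already re-routed by xfrm
 * (IPSKB_REROUTED) the hook is skipped and the okfn runs directly.
 * A rough spelled-out equivalent of the call above:
 *
 *      if (!(IPCB(skb)->flags & IPSKB_REROUTED))
 *              return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb,
 *                             NULL, dev, ip_finish_output);
 *      return ip_finish_output(skb);
 */
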
int ip_queue_xmit(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options_rcu *inet_opt;
        struct rtable *rt;
        struct iphdr *iph;
        int res;

        /* Skip all of this if the packet is already routed,
         * f.e. by something like SCTP.
         */
        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        rt = skb_rtable(skb);
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use the correct destination address if we have options. */
                daddr = inet->inet_daddr;
                if (inet_opt && inet_opt->opt.srr)
                        daddr = inet_opt->opt.faddr;

                /* If this fails, the retransmit mechanism of the transport
                 * layer will keep trying until the route appears or the
                 * connection times itself out.
                 */
                rt = ip_route_output_ports(sock_net(sk), sk,
                                           daddr, inet->inet_saddr,
                                           inet->inet_dport,
                                           inet->inet_sport,
                                           sk->sk_protocol,
                                           RT_CONN_FLAGS(sk),
                                           sk->sk_bound_dev_if);
                if (IS_ERR(rt))
                        goto no_route;
                sk_setup_caps(sk, &rt->dst);
        }
        skb_dst_set_noref(skb, &rt->dst);

packet_routed:
        if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl = ip_select_ttl(inet, &rt->dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        /* The transport layer has set skb->h.foo itself. */

        if (inet_opt && inet_opt->opt.optlen) {
                iph->ihl += inet_opt->opt.optlen >> 2;
                ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        res = ip_local_out(skb);
        rcu_read_unlock();
        return res;

no_route:
        rcu_read_unlock();
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);

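/*
 * A note on the combined store in ip_queue_xmit() above: the first
 * 16 bits of the IPv4 header pack version (4 bits), IHL (4 bits) and
 * TOS (8 bits), so htons((4 << 12) | (5 << 8) | tos) writes version=4,
 * ihl=5 and the TOS byte in one 16-bit store (first byte 0x45 on the
 * wire). The field-by-field equivalent would be:
 *
 *      iph->version = 4;
 *      iph->ihl = 5;
 *      iph->tos = inet->tos;
 *
 * which is exactly what ip_build_and_send_pkt() does; this path merely
 * saves a couple of stores on a hot path.
 */
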
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_copy(to, from);
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each of size equal to the IP header plus a block
 *      of the data of the original IP data part) that will still fit in a
 *      single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct iphdr *iph;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = skb_rtable(skb);
        int err = 0;

        dev = rt->dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
        if (skb->nf_bridge)
                mtu -= nf_bridge_mtu_reduction(skb);
#endif
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When frag_list is given, use it. First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited. In such a case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copy when we see the first bad
         * fragment.
         */
        if (skb_has_frag_list(skb)) {
                struct sk_buff *frag, *frag2;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one went down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph = ip_hdr(frag);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /* for bridged IP traffic encapsulated inside f.e. a vlan header,
         * we need to make room for the encapsulating header
         */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = ip_hdr(skb2);
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC : If we are fragmenting a fragment that's not the
                 *                 last fragment then keep MF on each fragment
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
        return err;
}
EXPORT_SYMBOL(ip_fragment);

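/*
 * To illustrate the slow-path arithmetic above: the IPv4 fragment offset
 * field counts 8-byte units, so every non-final fragment's payload is
 * trimmed with len &= ~7 and the header stores offset >> 3. A sketch of
 * the resulting fragment boundaries, assuming example values of a
 * 4000-byte payload, a 1500-byte MTU and a 20-byte header (no options):
 *
 *      unsigned int payload = 4000, mtu = 1500, hlen = 20;
 *      unsigned int data_space = mtu - hlen;           // 1480
 *      unsigned int off = 0;
 *      while (payload > 0) {
 *              unsigned int len = payload > data_space ?
 *                                 data_space & ~7 : payload;
 *              // fragment covers bytes [off, off + len);
 *              // frag_off field = off >> 3; MF set unless last piece
 *              off += len;
 *              payload -= len;
 *      }
 *
 * This yields fragments carrying 1480, 1480 and 1040 payload bytes at
 * offsets 0, 185 and 370 (in 8-byte units).
 */
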
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                                    int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP fragmentation offload, so
         * create one single skb packet containing the complete
         * udp datagram
         */
        if ((skb = skb_peek_tail(queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* reserve space for the hardware header */
                skb_reserve(skb, hh_len);

                /* create space for the UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize the network header pointer */
                skb_reset_network_header(skb);

                /* initialize the protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;

                /* specify the length of each IP datagram fragment */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(queue, skb);
        }

        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
                            struct inet_cork *cork,
                            int getfrag(void *from, char *to, int offset,
                                        int len, int odd, struct sk_buff *skb),
                            void *from, int length, int transhdrlen,
                            unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = cork->opt;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;
        struct rtable *rt = (struct rtable *)cork->dst;

        exthdrlen = transhdrlen ? rt->dst.header_len : 0;
        length += exthdrlen;
        transhdrlen += exthdrlen;
        mtu = cork->fragsize;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (cork->length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
                               mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we wish
         * it not to be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->dst.dev->features & NETIF_F_V4_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        skb = skb_peek_tail(queue);

        cork->length += length;
        if (((length > mtu) || (skb && skb_is_gso(skb))) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
                err = ip_ufo_append_data(sk, queue, getfrag, from, length,
                                         hh_len, fragheaderlen, transhdrlen,
                                         mtu, flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chained skb;
         * each of its segments is an IP fragment ready for sending to the
         * network once the appropriate IP header is added.
         */

        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into the current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If the remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = fraglen;

                        /* The last fragment gets additional space at the tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea which fragment will be
                         * the last.
                         */
                        if (datalen == length + fraggap) {
                                alloclen += rt->dst.trailer_len;
                                /* make sure the mtu is not exceeded */
                                if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
                                        datalen -= ALIGN(rt->dst.trailer_len, 8);
                        }
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                                else
                                        /* only the initial fragment is
                                           time stamped */
                                        cork->tx_flags = 0;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);
                        skb_shinfo(skb)->tx_flags = cork->tx_flags;

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = cork->page;
                        int off = cork->off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                cork->page = page;
                                cork->off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        cork->off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        cork->length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}

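/*
 * The sizing logic above hinges on two quantities:
 *
 *      fragheaderlen = sizeof(struct iphdr) + optlen;
 *      maxfraglen    = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 *
 * i.e. the usable payload per fragment is rounded down to a multiple of
 * 8 bytes, the unit of the IP fragment offset field. For example, with
 * mtu = 1500 and no options, 1500 - 20 = 1480 is already a multiple of
 * 8, so maxfraglen = 1500; with a 4-byte option, 1500 - 24 = 1476 rounds
 * down to 1472 and maxfraglen = 1496.
 */
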
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
                         struct ipcm_cookie *ipc, struct rtable **rtp)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options_rcu *opt;
        struct rtable *rt;

        /*
         * setup for corking.
         */
        opt = ipc->opt;
        if (opt) {
                if (cork->opt == NULL) {
                        cork->opt = kmalloc(sizeof(struct ip_options) + 40,
                                            sk->sk_allocation);
                        if (unlikely(cork->opt == NULL))
                                return -ENOBUFS;
                }
                memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
                cork->flags |= IPCORK_OPT;
                cork->addr = ipc->addr;
        }
        rt = *rtp;
        if (unlikely(!rt))
                return -EFAULT;
        /*
         * We steal a reference to this route; the caller should not
         * release it
         */
        *rtp = NULL;
        cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                         rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        cork->dst = &rt->dst;
        cork->length = 0;
        cork->tx_flags = ipc->tx_flags;
        cork->page = NULL;
        cork->off = 0;

        return 0;
}

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data. Each piece will be held on the socket
 *      until ip_push_pending_frames() is called. Each piece can be a page
 *      or non-page data.
 *
 *      Not only UDP: other transport protocols - e.g. raw sockets - can
 *      potentially use this interface too.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable **rtp,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        int err;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
                if (err)
                        return err;
        } else {
                transhdrlen = 0;
        }

        return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
                                from, length, transhdrlen, flags);
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = (struct rtable *)inet->cork.dst;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((size + skb->len > mtu) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }

        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {

                        /* Check if the remaining data fits into the current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   maxfraglen,
                                                    skb_transport_header(skb),
                                                                   fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                skb->truesize += len;
                atomic_add(len, &sk->sk_wmem_alloc);
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
        cork->flags &= ~IPCORK_OPT;
        kfree(cork->opt);
        cork->opt = NULL;
        dst_release(cork->dst);
        cork->dst = NULL;
}

Linus Torvalds1da177e2005-04-16 15:20:36 -07001261/*
1262 * Combined all pending IP fragments on the socket as one IP datagram
1263 * and push them out.
1264 */
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001265struct sk_buff *__ip_make_skb(struct sock *sk,
1266 struct sk_buff_head *queue,
1267 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001268{
1269 struct sk_buff *skb, *tmp_skb;
1270 struct sk_buff **tail_skb;
1271 struct inet_sock *inet = inet_sk(sk);
Pavel Emelyanov0388b002008-07-14 23:00:43 -07001272 struct net *net = sock_net(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001273 struct ip_options *opt = NULL;
Herbert Xu1470ddf2011-03-01 02:36:47 +00001274 struct rtable *rt = (struct rtable *)cork->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001275 struct iphdr *iph;
Alexey Dobriyan76ab6082006-01-06 13:24:29 -08001276 __be16 df = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001277 __u8 ttl;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001278
Herbert Xu1470ddf2011-03-01 02:36:47 +00001279 if ((skb = __skb_dequeue(queue)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001280 goto out;
1281 tail_skb = &(skb_shinfo(skb)->frag_list);
1282
1283 /* move skb->data to ip header from ext header */
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001284 if (skb->data < skb_network_header(skb))
Arnaldo Carvalho de Melobbe735e2007-03-10 22:16:10 -03001285 __skb_pull(skb, skb_network_offset(skb));
Herbert Xu1470ddf2011-03-01 02:36:47 +00001286 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
Arnaldo Carvalho de Melocfe1fc72007-03-16 17:26:39 -03001287 __skb_pull(tmp_skb, skb_network_header_len(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288 *tail_skb = tmp_skb;
1289 tail_skb = &(tmp_skb->next);
1290 skb->len += tmp_skb->len;
1291 skb->data_len += tmp_skb->len;
1292 skb->truesize += tmp_skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293 tmp_skb->destructor = NULL;
1294 tmp_skb->sk = NULL;
1295 }
1296
1297 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1298 * to fragment the frame generated here. No matter, what transforms
1299 * how transforms change size of the packet, it will come out.
1300 */
John Heffner628a5c52007-04-20 15:53:27 -07001301 if (inet->pmtudisc < IP_PMTUDISC_DO)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001302 skb->local_df = 1;
1303
1304 /* DF bit is set when we want to see DF on outgoing frames.
1305 * If local_df is set too, we still allow to fragment this frame
1306 * locally. */
John Heffner628a5c52007-04-20 15:53:27 -07001307 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
Changli Gaod8d1f302010-06-10 23:31:35 -07001308 (skb->len <= dst_mtu(&rt->dst) &&
1309 ip_dont_fragment(sk, &rt->dst)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001310 df = htons(IP_DF);
1311
Herbert Xu1470ddf2011-03-01 02:36:47 +00001312 if (cork->flags & IPCORK_OPT)
1313 opt = cork->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001314
1315 if (rt->rt_type == RTN_MULTICAST)
1316 ttl = inet->mc_ttl;
1317 else
Changli Gaod8d1f302010-06-10 23:31:35 -07001318 ttl = ip_select_ttl(inet, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001319
1320 iph = (struct iphdr *)skb->data;
1321 iph->version = 4;
1322 iph->ihl = 5;
1323 if (opt) {
1324 iph->ihl += opt->optlen>>2;
Herbert Xu1470ddf2011-03-01 02:36:47 +00001325 ip_options_build(skb, opt, cork->addr, rt, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001326 }
1327 iph->tos = inet->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001328 iph->frag_off = df;
Changli Gaod8d1f302010-06-10 23:31:35 -07001329 ip_select_ident(iph, &rt->dst, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001330 iph->ttl = ttl;
1331 iph->protocol = sk->sk_protocol;
1332 iph->saddr = rt->rt_src;
1333 iph->daddr = rt->rt_dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001334
1335 skb->priority = sk->sk_priority;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -08001336 skb->mark = sk->sk_mark;
Eric Dumazeta21bba92008-11-24 16:07:50 -08001337 /*
1338 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1339	 * operations on the dst refcount.
1340 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001341 cork->dst = NULL;
Changli Gaod8d1f302010-06-10 23:31:35 -07001342 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001343
David L Stevens96793b42007-09-17 09:57:33 -07001344 if (iph->protocol == IPPROTO_ICMP)
Pavel Emelyanov0388b002008-07-14 23:00:43 -07001345 icmp_out_count(net, ((struct icmphdr *)
David L Stevens96793b42007-09-17 09:57:33 -07001346 skb_transport_header(skb))->type);
1347
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001348 ip_cork_release(cork);
1349out:
1350 return skb;
1351}
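/*
 * Illustrative sketch, not kernel code: after __ip_make_skb() the
 * per-fragment skbs queued by ip_append_data() hang off the head
 * skb's frag_list, so a hypothetical debug helper could walk the
 * coalesced chain like this:
 *
 *	struct sk_buff *frag;
 *
 *	skb_walk_frags(skb, frag)
 *		pr_debug("frag len=%u truesize=%u\n",
 *			 frag->len, frag->truesize);
 */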
1352
1353int ip_send_skb(struct sk_buff *skb)
1354{
1355 struct net *net = sock_net(skb->sk);
1356 int err;
1357
Herbert Xuc439cb22008-01-11 19:14:00 -08001358 err = ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001359 if (err) {
1360 if (err > 0)
Eric Dumazet6ce9e7b2009-09-02 18:05:33 -07001361 err = net_xmit_errno(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001362 if (err)
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001363 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001364 }
1365
Linus Torvalds1da177e2005-04-16 15:20:36 -07001366 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001367}
1368
Herbert Xu1470ddf2011-03-01 02:36:47 +00001369int ip_push_pending_frames(struct sock *sk)
1370{
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001371 struct sk_buff *skb;
1372
1373 skb = ip_finish_skb(sk);
1374 if (!skb)
1375 return 0;
1376
1377	/* Netfilter gets the whole, not-yet-fragmented skb. */
1378 return ip_send_skb(skb);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001379}
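/*
 * Usage sketch (hedged; "my_getfrag", "buf" and "len" stand for
 * hypothetical caller state): a corked sender appends data under the
 * socket lock, then either pushes the queue onto the wire or flushes
 * it on error:
 *
 *	err = ip_append_data(sk, my_getfrag, buf, len, 0,
 *			     &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 */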
1380
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381/*
1382 * Throw away all pending data on the socket.
1383 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001384static void __ip_flush_pending_frames(struct sock *sk,
1385 struct sk_buff_head *queue,
1386 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001388 struct sk_buff *skb;
1389
Herbert Xu1470ddf2011-03-01 02:36:47 +00001390 while ((skb = __skb_dequeue_tail(queue)) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001391 kfree_skb(skb);
1392
Herbert Xu1470ddf2011-03-01 02:36:47 +00001393 ip_cork_release(cork);
1394}
1395
1396void ip_flush_pending_frames(struct sock *sk)
1397{
1398 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001399}
1400
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001401struct sk_buff *ip_make_skb(struct sock *sk,
1402 int getfrag(void *from, char *to, int offset,
1403 int len, int odd, struct sk_buff *skb),
1404 void *from, int length, int transhdrlen,
1405 struct ipcm_cookie *ipc, struct rtable **rtp,
1406 unsigned int flags)
1407{
1408 struct inet_cork cork = {};
1409 struct sk_buff_head queue;
1410 int err;
1411
1412 if (flags & MSG_PROBE)
1413 return NULL;
1414
1415 __skb_queue_head_init(&queue);
1416
1417 err = ip_setup_cork(sk, &cork, ipc, rtp);
1418 if (err)
1419 return ERR_PTR(err);
1420
1421 err = __ip_append_data(sk, &queue, &cork, getfrag,
1422 from, length, transhdrlen, flags);
1423 if (err) {
1424 __ip_flush_pending_frames(sk, &queue, &cork);
1425 return ERR_PTR(err);
1426 }
1427
1428 return __ip_make_skb(sk, &queue, &cork);
1429}
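/*
 * Usage sketch, loosely modelled on a lockless UDP-style sender (the
 * surrounding variables are assumed caller state): build the skb
 * without touching the cork on the socket, then hand it to
 * ip_send_skb() once the transport header is finished:
 *
 *	skb = ip_make_skb(sk, getfrag, msg->msg_iov, ulen, 0,
 *			  &ipc, &rt, msg->msg_flags);
 *	if (IS_ERR(skb))
 *		return PTR_ERR(skb);
 *	if (skb)		/* NULL when MSG_PROBE is set */
 *		err = ip_send_skb(skb);
 */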
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430
1431/*
1432 * Fetch data from kernel space and fill in checksum if needed.
1433 */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001434static int ip_reply_glue_bits(void *dptr, char *to, int offset,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001435 int len, int odd, struct sk_buff *skb)
1436{
Al Viro50842052006-11-14 21:36:34 -08001437 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001438
1439 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1440 skb->csum = csum_block_add(skb->csum, csum, odd);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001441 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001442}
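/*
 * The getfrag contract, as a minimal hedged sketch ("my_getfrag" is
 * hypothetical): copy len bytes of the caller's data at offset into
 * to, returning 0 on success. Callbacks that feed software-checksummed
 * paths must also fold the partial checksum into skb->csum, as
 * ip_reply_glue_bits() above does:
 *
 *	static int my_getfrag(void *from, char *to, int offset,
 *			      int len, int odd, struct sk_buff *skb)
 *	{
 *		memcpy(to, (char *)from + offset, len);
 *		return 0;
 *	}
 */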
1443
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001444/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001445 * Generic function to send a packet as a reply to another packet.
1446 * So far it is used to send TCP resets; ICMP should use this function too.
1447 *
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001448 * Must run single-threaded per socket because it uses the sock
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449 * structure to pass arguments.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001450 */
1451void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1452 unsigned int len)
1453{
1454 struct inet_sock *inet = inet_sk(sk);
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001455 struct ip_options_data replyopts;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001456 struct ipcm_cookie ipc;
Al Viro3ca3c682006-09-27 18:28:07 -07001457 __be32 daddr;
Eric Dumazet511c3f92009-06-02 05:14:27 +00001458 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001459
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001460 if (ip_options_echo(&replyopts.opt.opt, skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001461 return;
1462
1463 daddr = ipc.addr = rt->rt_src;
1464 ipc.opt = NULL;
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001465 ipc.tx_flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001466
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001467 if (replyopts.opt.opt.optlen) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001468 ipc.opt = &replyopts.opt;
1469
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001470 if (replyopts.opt.opt.srr)
1471 daddr = replyopts.opt.opt.faddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001472 }
1473
1474 {
David S. Miller538de0e2011-03-31 04:53:37 -07001475 struct flowi4 fl4;
1476
1477 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1478 RT_TOS(ip_hdr(skb)->tos),
1479 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1480 ip_reply_arg_flowi_flags(arg),
1481 daddr, rt->rt_spec_dst,
1482 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
David S. Miller9d6ec932011-03-12 01:12:47 -05001483 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1484 rt = ip_route_output_key(sock_net(sk), &fl4);
David S. Millerb23dd4f2011-03-02 14:31:35 -08001485 if (IS_ERR(rt))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001486 return;
1487 }
1488
1489	/* And let IP do all the hard work.
1490
1491	   This chunk is not re-entrant, hence the spinlock.
1492	   Note that it relies on the fact that this function is called
1493	   with BHs disabled locally and that sk cannot already be locked.
1494	 */
1495 bh_lock_sock(sk);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001496 inet->tos = ip_hdr(skb)->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497 sk->sk_priority = skb->priority;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001498 sk->sk_protocol = ip_hdr(skb)->protocol;
Patrick McHardyf0e48db2007-06-04 21:32:46 -07001499 sk->sk_bound_dev_if = arg->bound_dev_if;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001500 ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
Eric Dumazet2e77d892008-11-24 15:52:46 -08001501 &ipc, &rt, MSG_DONTWAIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001502 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1503 if (arg->csumoffset >= 0)
Arnaldo Carvalho de Melo9c702202007-04-25 18:04:18 -07001504 *((__sum16 *)skb_transport_header(skb) +
1505 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1506 arg->csum));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507 skb->ip_summed = CHECKSUM_NONE;
1508 ip_push_pending_frames(sk);
1509 }
1510
1511 bh_unlock_sock(sk);
1512
1513 ip_rt_put(rt);
1514}
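/*
 * Caller sketch, loosely modelled on the TCP reset path ("rep",
 * "ctl_sk", "saddr" and "daddr" are assumptions for illustration):
 * the reply payload and its checksum state travel in a
 * struct ip_reply_arg:
 *
 *	struct ip_reply_arg arg;
 *
 *	memset(&arg, 0, sizeof(arg));
 *	arg.iov[0].iov_base = &rep;
 *	arg.iov[0].iov_len  = sizeof(rep);
 *	arg.csum = csum_tcpudp_nofold(saddr, daddr, sizeof(rep),
 *				      IPPROTO_TCP, 0);
 *	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *	ip_send_reply(ctl_sk, skb, &arg, arg.iov[0].iov_len);
 */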
1515
Linus Torvalds1da177e2005-04-16 15:20:36 -07001516void __init ip_init(void)
1517{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001518 ip_rt_init();
1519 inet_initpeers();
1520
1521#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1522 igmp_mc_proc_init();
1523#endif
1524}