/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *		Andi Kleen :		Replace ip_reply with ip_send_reply.
 *		Andi Kleen :		Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

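/*
 * Fix up tot_len and the header checksum, then run the skb through the
 * netfilter LOCAL_OUT hook.  ip_local_out() continues to dst_output()
 * when the hook lets the packet pass (nf_hook() returns 1).
 */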
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

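/*
 * Pick the TTL for an outgoing packet: the socket's unicast TTL if the
 * user set one, otherwise the per-route default hop limit.
 */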
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->daddr    = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr    = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

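/*
 * Hand a routed skb to the link layer: bump the multicast/broadcast
 * counters, make sure there is headroom for the hardware header, then
 * either use the cached link-layer header or ask the neighbour entry
 * to resolve and emit it.
 */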
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	neigh = dst->neighbour;
	if (neigh) {
		struct hh_cache *hh = &neigh->hh;
		if (hh->hh_len)
			return neigh_hh_output(hh, skb);
		else
			return dst->neighbour->output(skb);
	}
	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

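/*
 * MTU to fragment against: the device MTU when the socket is probing
 * path MTU itself (IP_PMTUDISC_PROBE), otherwise the route's MTU.
 */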
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

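/*
 * Last IP-level stop before the link layer: reroute through xfrm if
 * SNAT gave the packet a new policy, and fragment anything that is
 * over the path MTU and not a GSO skb.
 */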
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loopback not local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

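/*
 * Standard unicast output path: account the packet, stamp dev and
 * protocol, and run POST_ROUTING (skipped when the packet was already
 * rerouted) before ip_finish_output().
 */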
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

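/*
 * Queue a transport-layer packet (TCP and friends) for output: route
 * it via the socket's cached route if possible, build the IP header,
 * and push it through ip_local_out() under rcu_read_lock().
 */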
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = fl4->saddr;
	iph->daddr    = fl4->daddr;
	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);

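/*
 * Copy the per-packet metadata a fragment must inherit from the
 * original skb: priority, protocol, dst, netfilter and tc state.
 */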
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create wrong frag_list or break existing
	 * one, it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

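/*
 * getfrag() callback for the common case: copy (and, on paths without
 * hardware checksum offload, checksum) user iovec data into the skb
 * being built.
 */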
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

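/*
 * Checksum a page fragment through a temporary kernel mapping.
 */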
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

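/*
 * Build (or extend) one oversized skb for UDP fragmentation offload;
 * gso_size tells the device how large each IP fragment it produces on
 * transmit may be.
 */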
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

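/*
 * Core of the corking engine: append user data to the queue of pending
 * fragments, growing the tail skb or allocating new fragment-sized
 * skbs (page frags when the device supports scatter-gather) until the
 * datagram is pushed.
 */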
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 mtu, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use calculated fragment length to generate chained skb,
	 * each of segments is IP fragment ready for sending to network after
	 * adding appropriate IP header.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = cork->page;
			int off = cork->off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				cork->page = page;
				cork->off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			cork->off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

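/*
 * Initialise the cork from the caller's cookie: stash a private copy
 * of the IP options, steal the route reference, and record the
 * fragment size to build against.
 */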
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece will be held on the socket
 *	until ip_push_pending_frames() is called.  Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
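/*
 * A minimal usage sketch (modelled loosely on how datagram protocols
 * of this era drive the interface; the caller-side variable names are
 * illustrative, not taken from this file):
 *
 *	err = ip_append_data(sk, fl4, ip_generic_getfrag, msg->msg_iov,
 *			     len, 0, &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk, fl4);
 */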
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

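/*
 * Drop everything the cork holds: the options buffer and the route
 * reference.
 */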
static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

Linus Torvalds1da177e2005-04-16 15:20:36 -07001268/*
1269 * Combined all pending IP fragments on the socket as one IP datagram
1270 * and push them out.
1271 */
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001272struct sk_buff *__ip_make_skb(struct sock *sk,
David S. Miller77968b72011-05-08 17:12:19 -07001273 struct flowi4 *fl4,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001274 struct sk_buff_head *queue,
1275 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001276{
1277 struct sk_buff *skb, *tmp_skb;
1278 struct sk_buff **tail_skb;
1279 struct inet_sock *inet = inet_sk(sk);
Pavel Emelyanov0388b002008-07-14 23:00:43 -07001280 struct net *net = sock_net(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001281 struct ip_options *opt = NULL;
Herbert Xu1470ddf2011-03-01 02:36:47 +00001282 struct rtable *rt = (struct rtable *)cork->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283 struct iphdr *iph;
Alexey Dobriyan76ab6082006-01-06 13:24:29 -08001284 __be16 df = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001285 __u8 ttl;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001286
Herbert Xu1470ddf2011-03-01 02:36:47 +00001287 if ((skb = __skb_dequeue(queue)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001288 goto out;
1289 tail_skb = &(skb_shinfo(skb)->frag_list);
1290
1291 /* move skb->data to ip header from ext header */
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001292 if (skb->data < skb_network_header(skb))
Arnaldo Carvalho de Melobbe735e2007-03-10 22:16:10 -03001293 __skb_pull(skb, skb_network_offset(skb));
Herbert Xu1470ddf2011-03-01 02:36:47 +00001294 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
Arnaldo Carvalho de Melocfe1fc72007-03-16 17:26:39 -03001295 __skb_pull(tmp_skb, skb_network_header_len(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001296 *tail_skb = tmp_skb;
1297 tail_skb = &(tmp_skb->next);
1298 skb->len += tmp_skb->len;
1299 skb->data_len += tmp_skb->len;
1300 skb->truesize += tmp_skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001301 tmp_skb->destructor = NULL;
1302 tmp_skb->sk = NULL;
1303 }
1304
1305 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1306 * to fragment the frame generated here. No matter, what transforms
1307 * how transforms change size of the packet, it will come out.
1308 */
John Heffner628a5c52007-04-20 15:53:27 -07001309 if (inet->pmtudisc < IP_PMTUDISC_DO)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001310 skb->local_df = 1;
1311
1312 /* DF bit is set when we want to see DF on outgoing frames.
1313 * If local_df is set too, we still allow to fragment this frame
1314 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = fl4->saddr;
	iph->daddr = fl4->daddr;

	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, cork->addr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	cork->dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
				skb_transport_header(skb))->type);

	ip_cork_release(cork);
out:
	return skb;
}

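/* Hand a fully built datagram to the local IP output path, folding the
 * NET_XMIT_* return codes into an errno and counting discards. */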
int ip_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	int err;

	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

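/* Combine the socket's pending fragments into one datagram and send it.
 * Returns 0 if nothing was queued. */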
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb;

	skb = ip_finish_skb(sk, fl4);
	if (!skb)
		return 0;

	/* Netfilter gets the whole, unfragmented skb. */
	return ip_send_skb(skb);
}

/*
 *	Throw away all pending data on the socket.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(cork);
}

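/* Public wrapper: discard whatever is queued on the socket's default
 * write queue and release its base cork. */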
void ip_flush_pending_frames(struct sock *sk)
{
	__ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
}

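/* Single-shot variant of ip_append_data()/ip_push_pending_frames(): the
 * datagram is built on a private queue with an on-stack cork, so no
 * socket state is left behind on failure. Returns the skb, an ERR_PTR,
 * or NULL for MSG_PROBE. */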
struct sk_buff *ip_make_skb(struct sock *sk,
			    struct flowi4 *fl4,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    struct ipcm_cookie *ipc, struct rtable **rtp,
			    unsigned int flags)
{
	struct inet_cork cork;
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.flags = 0;
	cork.addr = 0;
	cork.opt = NULL;
	err = ip_setup_cork(sk, &cork, ipc, rtp);
	if (err)
		return ERR_PTR(err);

	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
			       from, length, transhdrlen, flags);
	if (err) {
		__ip_flush_pending_frames(sk, &queue, &cork);
		return ERR_PTR(err);
	}

	return __ip_make_skb(sk, fl4, &queue, &cork);
}

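/*
 * Usage sketch (illustrative only, not code from this file): a datagram
 * protocol can build and send in one shot on this corkless path, roughly
 * the way UDP's lockless transmit does. This assumes a routed flow in
 * fl4/rt and ip_generic_getfrag() as the copy helper:
 *
 *	skb = ip_make_skb(sk, &fl4, ip_generic_getfrag, msg->msg_iov,
 *			  len, transhdrlen, &ipc, &rt, msg->msg_flags);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = ip_send_skb(skb);
 *
 * The corked path instead uses ip_append_data() followed by
 * ip_push_pending_frames() on the socket's own write queue.
 */
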
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as a reply to another packet.
 *	So far it is used only to send TCP resets; ICMP should use it too.
 *
 *	Should run single-threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
		   struct ip_reply_arg *arg, unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_data replyopts;
	struct ipcm_cookie ipc;
	struct flowi4 fl4;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt.opt, skb))
		return;

	ipc.addr = daddr;
	ipc.opt = NULL;
	ipc.tx_flags = 0;

	if (replyopts.opt.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (replyopts.opt.opt.srr)
			daddr = replyopts.opt.opt.faddr;
	}

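	/* Route the reply as the mirror of the incoming packet: the
	 * transport ports are swapped, and our source address is the
	 * destination the original packet was sent to (rt_spec_dst). */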
	flowi4_init_output(&fl4, arg->bound_dev_if, 0,
			   RT_TOS(ip_hdr(skb)->tos),
			   RT_SCOPE_UNIVERSE, sk->sk_protocol,
			   ip_reply_arg_flowi_flags(arg),
			   daddr, rt->rt_spec_dst,
			   tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt))
		return;

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock. Note that it
	   relies on this function being called with BHs locally disabled
	   and on sk not already being spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk, &fl4);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

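/* Boot-time initialization for the IP layer: routing, inetpeer storage,
 * and (when configured) the IGMP /proc entries. */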
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}