/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);

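/*
 * __ip_local_out() finalizes the IP header (total length and checksum)
 * and runs the netfilter LOCAL_OUT hook.  A return value of 1 means the
 * hook chain accepted the packet and the caller still has to pass it to
 * dst_output(); ip_local_out() below is the wrapper that does exactly that.
 */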
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

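/*
 * TTL for a unicast packet: the per-socket value if the application set
 * one (uc_ttl >= 0, e.g. via the IP_TTL socket option), otherwise the
 * hop limit that goes with the route.
 */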
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = ip4_dst_hoplimit(dst);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
	iph->saddr = saddr;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->opt.optlen) {
		iph->ihl += opt->opt.optlen>>2;
		ip_options_build(skb, &opt->opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

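/*
 * Last IP-level step on the output path: update multicast/broadcast
 * counters, make sure the skb has room for the link-layer header
 * (reallocating it if necessary), resolve the next hop to a neighbour
 * entry and hand the frame to neigh_output() for transmission.
 */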
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	rcu_read_lock_bh();
	nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr;
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb(skb);
	return -EINVAL;
}

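/*
 * MTU to fragment against: under IP_PMTUDISC_PROBE the socket is probing
 * the path MTU itself, so the raw device MTU is used instead of the
 * (possibly smaller) MTU cached in the route.
 */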
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

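/*
 * Runs as the POST_ROUTING okfn: if an IPsec policy matched only after
 * SNAT rewrote the addresses, re-route through dst_output(); otherwise
 * fragment anything larger than the path MTU that is not a GSO skb
 * (GSO skbs are cut down to size later, by the segmentation code).
 */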
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

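/*
 * Output routine for multicast (and broadcast) routes.  Besides the
 * normal POST_ROUTING/ip_finish_output() processing, a clone of the
 * packet is looped back to local listeners when the socket has
 * multicast loopback enabled (see sk_mc_loop()).
 */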
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

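/*
 * Standard dst->output for unicast IPv4: account the packet, set the
 * output device and protocol, then run POST_ROUTING before
 * ip_finish_output().  NF_HOOK_COND skips the hook for packets flagged
 * IPSKB_REROUTED, which already traversed it before being re-routed.
 */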
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/*
 * copy saddr and daddr, possibly using 64bit load/stores
 * Equivalent to :
 *   iph->saddr = fl4->saddr;
 *   iph->daddr = fl4->daddr;
 */
static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
	BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
		     offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
	memcpy(&iph->saddr, &fl4->saddr,
	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
}

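/*
 * Transmit routine used by connection-oriented sockets (e.g. TCP's
 * queue_xmit hook): reuse the route cached on the socket while it is
 * still valid, otherwise route the flow again and cache the result,
 * then build the IP header and pass the packet to ip_local_out().
 */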
int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(sock_net(sk), fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS(sk),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);

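/*
 * Propagate per-packet metadata (packet type, priority, dst, netfilter
 * state, security mark, ...) from the original skb to one fragment.
 */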
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

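/*
 * Two strategies below: when the skb already carries a usable frag list
 * (every fragment a multiple of 8 bytes except the last, enough headroom,
 * nothing shared or cloned), each list member becomes an IP fragment
 * almost for free; otherwise the slow path allocates new skbs and copies
 * the payload out block by block.
 */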
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one, which is not prohibited. In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when we see the first bad fragment.
	 */
	if (skb_has_frag_list(skb)) {
		struct sk_buff *frag, *frag2;
		int first_len = skb_pagelen(skb);

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    ip_is_fragment(iph) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep MF on each bit
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	consume_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);

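/*
 * Generic getfrag callback for ip_append_data() users: copy a chunk of
 * the user iovec into the skb, folding the data into skb->csum when the
 * packet cannot be checksummed by hardware (ip_summed != CHECKSUM_PARTIAL).
 */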
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

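/*
 * UFO path: when the device can segment UDP itself (NETIF_F_UFO), build
 * one oversized skb for the whole datagram, record the fragment payload
 * size in gso_size and leave the actual splitting into MTU-sized IP
 * fragments to the segmentation code.
 */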
static inline int ip_ufo_append_data(struct sock *sk,
			struct sk_buff_head *queue,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int maxfraglen, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}

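/*
 * Workhorse behind ip_append_data() and ip_make_skb(): append 'length'
 * bytes, fetched through getfrag(), to the queue of pending skbs.  Data
 * is packed into the tail skb while it fits; a new skb sized to the
 * fragment boundary (maxfraglen) is allocated whenever it does not, so
 * the queue can later be flushed as ready-made IP fragments.
 */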
static int __ip_append_data(struct sock *sk,
			    struct flowi4 *fl4,
			    struct sk_buff_head *queue,
			    struct inet_cork *cork,
			    int getfrag(void *from, char *to, int offset,
					int len, int odd, struct sk_buff *skb),
			    void *from, int length, int transhdrlen,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;

	skb = skb_peek_tail(queue);

	exthdrlen = !skb ? rt->dst.header_len : 0;
	mtu = cork->fragsize;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	cork->length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
		err = ip_ufo_append_data(sk, queue, getfrag, from, length,
					 hh_len, fragheaderlen, transhdrlen,
					 maxfraglen, flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate chained skbs;
	 * each segment is an IP fragment ready for sending to the network
	 * once the appropriate IP header has been added.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = fraglen;

			alloclen += exthdrlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					cork->tx_flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			skb_shinfo(skb)->tx_flags = cork->tx_flags;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen + exthdrlen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen + exthdrlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

974 copy = length;
975
Changli Gaod8d1f302010-06-10 23:31:35 -0700976 if (!(rt->dst.dev->features&NETIF_F_SG)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700977 unsigned int off;
978
979 off = skb->len;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900980 if (getfrag(from, skb_put(skb, copy),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700981 offset, copy, off, skb) < 0) {
982 __skb_trim(skb, off);
983 err = -EFAULT;
984 goto error;
985 }
986 } else {
987 int i = skb_shinfo(skb)->nr_frags;
988 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
Herbert Xu1470ddf2011-03-01 02:36:47 +0000989 struct page *page = cork->page;
990 int off = cork->off;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700991 unsigned int left;
992
993 if (page && (left = PAGE_SIZE - off) > 0) {
994 if (copy >= left)
995 copy = left;
Ian Campbellaff65da2011-08-22 23:44:59 +0000996 if (page != skb_frag_page(frag)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700997 if (i == MAX_SKB_FRAGS) {
998 err = -EMSGSIZE;
999 goto error;
1000 }
Herbert Xu1470ddf2011-03-01 02:36:47 +00001001 skb_fill_page_desc(skb, i, page, off, 0);
Ian Campbellaff65da2011-08-22 23:44:59 +00001002 skb_frag_ref(skb, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001003 frag = &skb_shinfo(skb)->frags[i];
1004 }
1005 } else if (i < MAX_SKB_FRAGS) {
1006 if (copy > PAGE_SIZE)
1007 copy = PAGE_SIZE;
1008 page = alloc_pages(sk->sk_allocation, 0);
1009 if (page == NULL) {
1010 err = -ENOMEM;
1011 goto error;
1012 }
Herbert Xu1470ddf2011-03-01 02:36:47 +00001013 cork->page = page;
1014 cork->off = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001015
1016 skb_fill_page_desc(skb, i, page, 0, 0);
1017 frag = &skb_shinfo(skb)->frags[i];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001018 } else {
1019 err = -EMSGSIZE;
1020 goto error;
1021 }
Eric Dumazet9e903e02011-10-18 21:00:24 +00001022 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
Ian Campbellaff65da2011-08-22 23:44:59 +00001023 offset, copy, skb->len, skb) < 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001024 err = -EFAULT;
1025 goto error;
1026 }
Herbert Xu1470ddf2011-03-01 02:36:47 +00001027 cork->off += copy;
Eric Dumazet9e903e02011-10-18 21:00:24 +00001028 skb_frag_size_add(frag, copy);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001029 skb->len += copy;
1030 skb->data_len += copy;
Herbert Xuf945fa72008-01-22 22:39:26 -08001031 skb->truesize += copy;
1032 atomic_add(copy, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001033 }
1034 offset += copy;
1035 length -= copy;
1036 }
1037
1038 return 0;
1039
1040error:
Herbert Xu1470ddf2011-03-01 02:36:47 +00001041 cork->length -= length;
Pavel Emelyanov5e38e272008-07-16 20:19:49 -07001042 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001043 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001044}
1045
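/*
 * Prepare the cork for a new datagram: keep a private copy of the IP
 * options, steal the caller's route reference (*rtp is cleared), and
 * record the fragment size that subsequent appends will build against.
 */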
static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
			 struct ipcm_cookie *ipc, struct rtable **rtp)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opt;
	struct rtable *rt;

	/*
	 * setup for corking.
	 */
	opt = ipc->opt;
	if (opt) {
		if (cork->opt == NULL) {
			cork->opt = kmalloc(sizeof(struct ip_options) + 40,
					    sk->sk_allocation);
			if (unlikely(cork->opt == NULL))
				return -ENOBUFS;
		}
		memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
		cork->flags |= IPCORK_OPT;
		cork->addr = ipc->addr;
	}
	rt = *rtp;
	if (unlikely(!rt))
		return -EFAULT;
	/*
	 * We steal reference to this route, caller should not release it
	 */
	*rtp = NULL;
	cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
			 rt->dst.dev->mtu : dst_mtu(&rt->dst);
	cork->dst = &rt->dst;
	cork->length = 0;
	cork->tx_flags = ipc->tx_flags;
	cork->page = NULL;
	cork->off = 0;

	return 0;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP, other transport protocols - e.g. raw sockets - can use
 *	this interface potentially.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk, struct flowi4 *fl4,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int err;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
		if (err)
			return err;
	} else {
		transhdrlen = 0;
	}

	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
				from, length, transhdrlen, flags);
}

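/*
 * Zero-copy companion to ip_append_data(): instead of copying payload,
 * take a reference on the caller's page and hang it off the pending tail
 * skb.  Typically reached via UDP sendpage(); requires a scatter-gather
 * capable device (NETIF_F_SG) and an already corked socket.
 */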
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	cork->length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_cork *cork)
{
	cork->flags &= ~IPCORK_OPT;
	kfree(cork->opt);
	cork->opt = NULL;
	dst_release(cork->dst);
	cork->dst = NULL;
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
struct sk_buff *__ip_make_skb(struct sock *sk,
			      struct flowi4 *fl4,
			      struct sk_buff_head *queue,
			      struct inet_cork *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)cork->dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;

	if ((skb = __skb_dequeue(queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

1313	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we
1314	 * allow fragmenting the frame generated here.  No matter how
1315	 * transforms change the size of the packet, it will come out.
1316 */
John Heffner628a5c52007-04-20 15:53:27 -07001317 if (inet->pmtudisc < IP_PMTUDISC_DO)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001318 skb->local_df = 1;
1319
1320 /* DF bit is set when we want to see DF on outgoing frames.
1321	 * If local_df is set too, we still allow this frame to be fragmented
1322 * locally. */
John Heffner628a5c52007-04-20 15:53:27 -07001323 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
Changli Gaod8d1f302010-06-10 23:31:35 -07001324 (skb->len <= dst_mtu(&rt->dst) &&
1325 ip_dont_fragment(sk, &rt->dst)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001326 df = htons(IP_DF);
1327
Herbert Xu1470ddf2011-03-01 02:36:47 +00001328 if (cork->flags & IPCORK_OPT)
1329 opt = cork->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001330
1331 if (rt->rt_type == RTN_MULTICAST)
1332 ttl = inet->mc_ttl;
1333 else
Changli Gaod8d1f302010-06-10 23:31:35 -07001334 ttl = ip_select_ttl(inet, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001335
1336 iph = (struct iphdr *)skb->data;
1337 iph->version = 4;
1338 iph->ihl = 5;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001339 iph->tos = inet->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001340 iph->frag_off = df;
Changli Gaod8d1f302010-06-10 23:31:35 -07001341 ip_select_ident(iph, &rt->dst, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001342 iph->ttl = ttl;
1343 iph->protocol = sk->sk_protocol;
Eric Dumazet84f93072011-11-30 19:00:53 +00001344 ip_copy_addrs(iph, fl4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001345
David S. Miller22f728f2011-05-13 17:21:27 -04001346 if (opt) {
1347 iph->ihl += opt->optlen>>2;
1348 ip_options_build(skb, opt, cork->addr, rt, 0);
1349 }
1350
Linus Torvalds1da177e2005-04-16 15:20:36 -07001351 skb->priority = sk->sk_priority;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -08001352 skb->mark = sk->sk_mark;
Eric Dumazeta21bba92008-11-24 16:07:50 -08001353 /*
1354 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1355 * on dst refcount
1356 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001357 cork->dst = NULL;
Changli Gaod8d1f302010-06-10 23:31:35 -07001358 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001359
David L Stevens96793b42007-09-17 09:57:33 -07001360 if (iph->protocol == IPPROTO_ICMP)
Pavel Emelyanov0388b002008-07-14 23:00:43 -07001361 icmp_out_count(net, ((struct icmphdr *)
David L Stevens96793b42007-09-17 09:57:33 -07001362 skb_transport_header(skb))->type);
1363
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001364 ip_cork_release(cork);
1365out:
1366 return skb;
1367}
1368
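/* Hand a datagram built by __ip_make_skb()/ip_finish_skb() to the output
 * path.  Congestion notifications from ip_local_out() are turned into
 * errnos by net_xmit_errno(), and real failures are counted as output
 * discards. */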
1369int ip_send_skb(struct sk_buff *skb)
1370{
1371 struct net *net = sock_net(skb->sk);
1372 int err;
1373
Herbert Xuc439cb22008-01-11 19:14:00 -08001374 err = ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001375 if (err) {
1376 if (err > 0)
Eric Dumazet6ce9e7b2009-09-02 18:05:33 -07001377 err = net_xmit_errno(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001378 if (err)
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001379 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001380 }
1381
Linus Torvalds1da177e2005-04-16 15:20:36 -07001382 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383}
1384
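/* Turn everything pending on sk->sk_write_queue into a single datagram
 * and send it; a NULL skb from ip_finish_skb() just means nothing was
 * queued. */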
David S. Miller77968b72011-05-08 17:12:19 -07001385int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
Herbert Xu1470ddf2011-03-01 02:36:47 +00001386{
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001387 struct sk_buff *skb;
1388
David S. Miller77968b72011-05-08 17:12:19 -07001389 skb = ip_finish_skb(sk, fl4);
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001390 if (!skb)
1391 return 0;
1392
1393	/* Netfilter gets the whole, unfragmented skb. */
1394 return ip_send_skb(skb);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001395}
1396
Linus Torvalds1da177e2005-04-16 15:20:36 -07001397/*
1398 * Throw away all pending data on the socket.
1399 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001400static void __ip_flush_pending_frames(struct sock *sk,
1401 struct sk_buff_head *queue,
1402 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001403{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404 struct sk_buff *skb;
1405
Herbert Xu1470ddf2011-03-01 02:36:47 +00001406 while ((skb = __skb_dequeue_tail(queue)) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001407 kfree_skb(skb);
1408
Herbert Xu1470ddf2011-03-01 02:36:47 +00001409 ip_cork_release(cork);
1410}
1411
1412void ip_flush_pending_frames(struct sock *sk)
1413{
David S. Millerbdc712b2011-05-06 15:02:07 -07001414 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415}
1416
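/* Single-shot counterpart of the append/push pair above: build the
 * datagram on a private queue with an on-stack cork, leaving
 * sk->sk_write_queue and the socket's cork state untouched.  Returns
 * NULL for MSG_PROBE, an ERR_PTR() on failure, or an skb ready for
 * ip_send_skb(). */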
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001417struct sk_buff *ip_make_skb(struct sock *sk,
David S. Miller77968b72011-05-08 17:12:19 -07001418 struct flowi4 *fl4,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001419 int getfrag(void *from, char *to, int offset,
1420 int len, int odd, struct sk_buff *skb),
1421 void *from, int length, int transhdrlen,
1422 struct ipcm_cookie *ipc, struct rtable **rtp,
1423 unsigned int flags)
1424{
David S. Millerb80d7222011-05-06 15:06:01 -07001425 struct inet_cork cork;
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001426 struct sk_buff_head queue;
1427 int err;
1428
1429 if (flags & MSG_PROBE)
1430 return NULL;
1431
1432 __skb_queue_head_init(&queue);
1433
David S. Millerb80d7222011-05-06 15:06:01 -07001434 cork.flags = 0;
1435 cork.addr = 0;
David S. Miller70652722011-05-06 16:01:15 -07001436 cork.opt = NULL;
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001437 err = ip_setup_cork(sk, &cork, ipc, rtp);
1438 if (err)
1439 return ERR_PTR(err);
1440
David S. Millerf5fca602011-05-08 17:24:10 -07001441 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001442 from, length, transhdrlen, flags);
1443 if (err) {
1444 __ip_flush_pending_frames(sk, &queue, &cork);
1445 return ERR_PTR(err);
1446 }
1447
David S. Miller77968b72011-05-08 17:12:19 -07001448 return __ip_make_skb(sk, fl4, &queue, &cork);
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001449}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001450
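/*
 * Illustrative sketch only (no in-tree caller looks exactly like this):
 * the typical pairing of ip_make_skb() with ip_send_skb().  The function
 * name is hypothetical; getfrag, from and length are whatever the caller
 * would otherwise have passed to ip_append_data().
 */
static int __maybe_unused ip_make_and_send_sketch(struct sock *sk,
						  struct flowi4 *fl4,
						  int getfrag(void *from, char *to,
							      int offset, int len,
							      int odd, struct sk_buff *skb),
						  void *from, int length,
						  struct ipcm_cookie *ipc,
						  struct rtable **rtp,
						  unsigned int flags)
{
	struct sk_buff *skb;

	skb = ip_make_skb(sk, fl4, getfrag, from, length, 0, ipc, rtp, flags);
	if (IS_ERR_OR_NULL(skb))
		return PTR_ERR(skb);	/* PTR_ERR(NULL) is 0: the MSG_PROBE case */

	return ip_send_skb(skb);
}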
1451/*
1452 * Fetch data from kernel space and fill in checksum if needed.
1453 */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001454static int ip_reply_glue_bits(void *dptr, char *to, int offset,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455 int len, int odd, struct sk_buff *skb)
1456{
Al Viro50842052006-11-14 21:36:34 -08001457 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001458
1459 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1460 skb->csum = csum_block_add(skb->csum, csum, odd);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001461 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001462}
1463
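/*
 * Illustrative sketch only: the classic corked sequence that
 * ip_send_unicast_reply() below performs in BH context, shown here in
 * process context.  The function name is hypothetical; data/len point at
 * kernel memory, which is what ip_reply_glue_bits() expects.
 */
static int __maybe_unused ip_corked_send_sketch(struct sock *sk,
						struct flowi4 *fl4,
						struct ipcm_cookie *ipc,
						struct rtable **rtp,
						void *data, int len)
{
	int err;

	lock_sock(sk);
	err = ip_append_data(sk, fl4, ip_reply_glue_bits, data, len, 0,
			     ipc, rtp, MSG_DONTWAIT);
	if (err)
		ip_flush_pending_frames(sk);	/* drop the partial datagram */
	else
		err = ip_push_pending_frames(sk, fl4);
	release_sock(sk);
	return err;
}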
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001464/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001465 *	Generic function to send a packet in reply to another packet.
David S. Miller70e73412012-06-28 03:21:41 -07001466 *	Used so far only to send TCP resets.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001467 *
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001468 *	Should be run single-threaded per socket because it uses the sock
Linus Torvalds1da177e2005-04-16 15:20:36 -07001469 * structure to pass arguments.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470 */
David S. Miller70e73412012-06-28 03:21:41 -07001471void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1472 __be32 saddr, const struct ip_reply_arg *arg,
1473 unsigned int len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001474{
1475 struct inet_sock *inet = inet_sk(sk);
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001476 struct ip_options_data replyopts;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001477 struct ipcm_cookie ipc;
David S. Miller77968b72011-05-08 17:12:19 -07001478 struct flowi4 fl4;
Eric Dumazet511c3f92009-06-02 05:14:27 +00001479 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001480
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001481 if (ip_options_echo(&replyopts.opt.opt, skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482 return;
1483
David S. Miller0a5ebb82011-05-09 13:22:43 -07001484 ipc.addr = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001485 ipc.opt = NULL;
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001486 ipc.tx_flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001487
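	/* Echo the incoming packet's IP options; with source routing the
	 * reply must be sent to the recorded first hop (faddr), not to the
	 * address the packet claims as its source. */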
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001488 if (replyopts.opt.opt.optlen) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001489 ipc.opt = &replyopts.opt;
1490
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001491 if (replyopts.opt.opt.srr)
1492 daddr = replyopts.opt.opt.faddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001493 }
1494
David S. Miller77968b72011-05-08 17:12:19 -07001495 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
Eric Dumazet66b13d92011-10-24 03:06:21 -04001496 RT_TOS(arg->tos),
David S. Miller77968b72011-05-08 17:12:19 -07001497 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1498 ip_reply_arg_flowi_flags(arg),
David S. Miller70e73412012-06-28 03:21:41 -07001499 daddr, saddr,
David S. Miller77968b72011-05-08 17:12:19 -07001500 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1501 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1502 rt = ip_route_output_key(sock_net(sk), &fl4);
1503 if (IS_ERR(rt))
1504 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001505
1506 /* And let IP do all the hard work.
1507
1508	   This chunk is not reentrant, hence the spinlock.
1509	   Note that it relies on the fact that this function is called
1510	   with BHs disabled locally and that sk cannot already be spinlocked.
1511 */
1512 bh_lock_sock(sk);
Eric Dumazet66b13d92011-10-24 03:06:21 -04001513 inet->tos = arg->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001514 sk->sk_priority = skb->priority;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001515 sk->sk_protocol = ip_hdr(skb)->protocol;
Patrick McHardyf0e48db2007-06-04 21:32:46 -07001516 sk->sk_bound_dev_if = arg->bound_dev_if;
David S. Millerf5fca602011-05-08 17:24:10 -07001517 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
Eric Dumazet2e77d892008-11-24 15:52:46 -08001518 &ipc, &rt, MSG_DONTWAIT);
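	/* If the caller asked for it (csumoffset >= 0), fold the checksum
	 * accumulated by ip_reply_glue_bits() into the transport header
	 * before pushing the reply. */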
Linus Torvalds1da177e2005-04-16 15:20:36 -07001519 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1520 if (arg->csumoffset >= 0)
Arnaldo Carvalho de Melo9c702202007-04-25 18:04:18 -07001521 *((__sum16 *)skb_transport_header(skb) +
1522 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1523 arg->csum));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001524 skb->ip_summed = CHECKSUM_NONE;
David S. Miller77968b72011-05-08 17:12:19 -07001525 ip_push_pending_frames(sk, &fl4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526 }
1527
1528 bh_unlock_sock(sk);
1529
1530 ip_rt_put(rt);
1531}
1532
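/* Called from inet_init() at boot: bring up the routing core and the
 * inetpeer cache, and register the IGMP proc entries when multicast is
 * configured in. */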
Linus Torvalds1da177e2005-04-16 15:20:36 -07001533void __init ip_init(void)
1534{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535 ip_rt_init();
1536 inet_initpeers();
1537
1538#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1539 igmp_mc_proc_init();
1540#endif
1541}