blob: b99ca4e154b96b456e406aa9b60104ff05bccc6b [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
Jesper Juhl02c30a82005-05-05 16:16:16 -07008 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -07009 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090023 * Bradford Johnson: Fix faulty handling of some frames when
Linus Torvalds1da177e2005-04-16 15:20:36 -070024 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +090034 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
Linus Torvalds1da177e2005-04-16 15:20:36 -070037 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070046#include <linux/module.h>
47#include <linux/types.h>
48#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070049#include <linux/mm.h>
50#include <linux/string.h>
51#include <linux/errno.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -040052#include <linux/highmem.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090053#include <linux/slab.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070054
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
Patrick McHardycfacb052006-01-08 22:36:54 -080069#include <net/xfrm.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070070#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070074#include <net/checksum.h>
75#include <net/inetpeer.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070076#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
Arnaldo Carvalho de Melo6cbb0df2005-08-09 19:49:02 -070081#include <linux/tcp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070082
/*
 * File-scope TTL setting, initialised to IPDEFTTL and exported to modules.
 * NOTE(review): presumably the sysctl-tunable default TTL for outgoing
 * packets; the sysctl table that exposes it is not visible here - confirm.
 */
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
Linus Torvalds1da177e2005-04-16 15:20:36 -070085
86/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph)
88{
89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91}
Eric Dumazet4bc2f182010-07-09 21:22:10 +000092EXPORT_SYMBOL(ip_send_check);
Linus Torvalds1da177e2005-04-16 15:20:36 -070093
Herbert Xuc439cb22008-01-11 19:14:00 -080094int __ip_local_out(struct sk_buff *skb)
95{
96 struct iphdr *iph = ip_hdr(skb);
97
98 iph->tot_len = htons(skb->len);
99 ip_send_check(iph);
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
101 skb_dst(skb)->dev, dst_output);
Herbert Xuc439cb22008-01-11 19:14:00 -0800102}
103
/*
 * Send a locally generated packet: run __ip_local_out() and, only when it
 * returns 1, hand the packet to dst_output().  Any other value from
 * __ip_local_out() is returned to the caller unchanged.
 */
int ip_local_out(struct sk_buff *skb)
{
	int verdict = __ip_local_out(skb);

	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
115
Linus Torvalds1da177e2005-04-16 15:20:36 -0700116/* dev_loopback_xmit for use with netfilter. */
117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118{
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -0700119 skb_reset_mac_header(newskb);
Arnaldo Carvalho de Melobbe735e2007-03-10 22:16:10 -0300120 __skb_pull(newskb, skb_network_offset(newskb));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700121 newskb->pkt_type = PACKET_LOOPBACK;
122 newskb->ip_summed = CHECKSUM_UNNECESSARY;
Eric Dumazetadf30902009-06-02 05:19:30 +0000123 WARN_ON(!skb_dst(newskb));
Julian Anastasovd52fbfc2011-08-07 10:17:22 +0000124 skb_dst_force(newskb);
Eric Dumazete30b38c2010-04-15 09:13:03 +0000125 netif_rx_ni(newskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700126 return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131 int ttl = inet->uc_ttl;
132
133 if (ttl < 0)
David S. Miller323e1262010-12-12 21:55:08 -0800134 ttl = ip4_dst_hoplimit(dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135 return ttl;
136}
137
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900138/*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700139 * Add an ip header to a skbuff and send it out.
140 *
141 */
142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
Eric Dumazetf6d8bd02011-04-21 09:45:37 +0000143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144{
145 struct inet_sock *inet = inet_sk(sk);
Eric Dumazet511c3f92009-06-02 05:14:27 +0000146 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700147 struct iphdr *iph;
148
149 /* Build the IP header. */
Eric Dumazetf6d8bd02011-04-21 09:45:37 +0000150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
Arnaldo Carvalho de Melo8856dfa2007-03-10 19:40:39 -0300151 skb_reset_network_header(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700152 iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153 iph->version = 4;
154 iph->ihl = 5;
155 iph->tos = inet->tos;
Changli Gaod8d1f302010-06-10 23:31:35 -0700156 if (ip_dont_fragment(sk, &rt->dst))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700157 iph->frag_off = htons(IP_DF);
158 else
159 iph->frag_off = 0;
Changli Gaod8d1f302010-06-10 23:31:35 -0700160 iph->ttl = ip_select_ttl(inet, &rt->dst);
David S. Millerdd927a22011-05-04 12:03:30 -0700161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 iph->saddr = saddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163 iph->protocol = sk->sk_protocol;
Changli Gaod8d1f302010-06-10 23:31:35 -0700164 ip_select_ident(iph, &rt->dst, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165
Eric Dumazetf6d8bd02011-04-21 09:45:37 +0000166 if (opt && opt->opt.optlen) {
167 iph->ihl += opt->opt.optlen>>2;
168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700169 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700170
171 skb->priority = sk->sk_priority;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800172 skb->mark = sk->sk_mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700173
174 /* Send it out. */
Herbert Xuc439cb22008-01-11 19:14:00 -0800175 return ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700176}
Arnaldo Carvalho de Melod8c97a92005-08-09 20:12:12 -0700177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
Linus Torvalds1da177e2005-04-16 15:20:36 -0700179static inline int ip_finish_output2(struct sk_buff *skb)
180{
Eric Dumazetadf30902009-06-02 05:19:30 +0000181 struct dst_entry *dst = skb_dst(skb);
Mitsuru Chinen80787eb2007-04-30 00:48:20 -0700182 struct rtable *rt = (struct rtable *)dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183 struct net_device *dev = dst->dev;
Chuck Leverc2636b42007-10-23 21:07:32 -0700184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
David S. Millerf6b72b62011-07-14 07:53:20 -0700185 struct neighbour *neigh;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700186
Neil Hormanedf391f2009-04-27 02:45:02 -0700187 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 } else if (rt->rt_type == RTN_BROADCAST)
190 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
Mitsuru Chinen80787eb2007-04-30 00:48:20 -0700191
Linus Torvalds1da177e2005-04-16 15:20:36 -0700192 /* Be paranoid, rather than too clever. */
Stephen Hemminger3b04ddd2007-10-09 01:40:57 -0700193 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700194 struct sk_buff *skb2;
195
196 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 if (skb2 == NULL) {
198 kfree_skb(skb);
199 return -ENOMEM;
200 }
201 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk);
Eric Dumazet5d0ba552012-06-04 01:17:19 +0000203 consume_skb(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700204 skb = skb2;
205 }
206
Eric Dumazetf2c31e32011-07-29 19:00:53 +0000207 rcu_read_lock();
David Miller27217452011-12-02 16:52:08 +0000208 neigh = dst_get_neighbour_noref(dst);
Eric Dumazetf2c31e32011-07-29 19:00:53 +0000209 if (neigh) {
210 int res = neigh_output(neigh, skb);
211
212 rcu_read_unlock();
213 return res;
214 }
215 rcu_read_unlock();
David S. Miller05e3aa02011-07-16 17:26:00 -0700216
Joe Perchese87cc472012-05-13 21:56:26 +0000217 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
218 __func__);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700219 kfree_skb(skb);
220 return -EINVAL;
221}
222
John Heffner628a5c52007-04-20 15:53:27 -0700223static inline int ip_skb_dst_mtu(struct sk_buff *skb)
224{
225 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
226
227 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
Eric Dumazetadf30902009-06-02 05:19:30 +0000228 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
John Heffner628a5c52007-04-20 15:53:27 -0700229}
230
Patrick McHardy861d0482007-10-15 01:48:39 -0700231static int ip_finish_output(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700232{
Patrick McHardy5c901da2006-01-06 23:05:36 -0800233#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
234 /* Policy lookup after SNAT yielded a new policy */
Eric Dumazetadf30902009-06-02 05:19:30 +0000235 if (skb_dst(skb)->xfrm != NULL) {
Patrick McHardy48d5cad2006-02-15 15:10:22 -0800236 IPCB(skb)->flags |= IPSKB_REROUTED;
237 return dst_output(skb);
238 }
Patrick McHardy5c901da2006-01-06 23:05:36 -0800239#endif
John Heffner628a5c52007-04-20 15:53:27 -0700240 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
Patrick McHardy1bd9bef2006-01-05 12:20:59 -0800241 return ip_fragment(skb, ip_finish_output2);
242 else
243 return ip_finish_output2(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700244}
245
246int ip_mc_output(struct sk_buff *skb)
247{
248 struct sock *sk = skb->sk;
Eric Dumazet511c3f92009-06-02 05:14:27 +0000249 struct rtable *rt = skb_rtable(skb);
Changli Gaod8d1f302010-06-10 23:31:35 -0700250 struct net_device *dev = rt->dst.dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251
252 /*
253 * If the indicated interface is up and running, send the packet.
254 */
Neil Hormanedf391f2009-04-27 02:45:02 -0700255 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700256
257 skb->dev = dev;
258 skb->protocol = htons(ETH_P_IP);
259
260 /*
261 * Multicasts are looped back for other local users
262 */
263
264 if (rt->rt_flags&RTCF_MULTICAST) {
Octavian Purdila7ad68482010-01-06 20:37:01 -0800265 if (sk_mc_loop(sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266#ifdef CONFIG_IP_MROUTE
267 /* Small optimization: do not loopback not local frames,
268 which returned after forwarding; they will be dropped
269 by ip_mr_input in any case.
270 Note, that local frames are looped back to be delivered
271 to local recipients.
272
273 This check is duplicated in ip_mr_input at the moment.
274 */
Joe Perches9d4fb272009-11-23 10:41:23 -0800275 &&
276 ((rt->rt_flags & RTCF_LOCAL) ||
277 !(IPCB(skb)->flags & IPSKB_FORWARDED))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278#endif
Joe Perches9d4fb272009-11-23 10:41:23 -0800279 ) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700280 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
281 if (newskb)
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 newskb, NULL, newskb->dev,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 ip_dev_loopback_xmit);
285 }
286
287 /* Multicasts with ttl 0 must not go beyond the host */
288
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700289 if (ip_hdr(skb)->ttl == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700290 kfree_skb(skb);
291 return 0;
292 }
293 }
294
295 if (rt->rt_flags&RTCF_BROADCAST) {
296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 if (newskb)
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 NULL, newskb->dev, ip_dev_loopback_xmit);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700300 }
301
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
303 skb->dev, ip_finish_output,
Patrick McHardy48d5cad2006-02-15 15:10:22 -0800304 !(IPCB(skb)->flags & IPSKB_REROUTED));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700305}
306
307int ip_output(struct sk_buff *skb)
308{
Eric Dumazetadf30902009-06-02 05:19:30 +0000309 struct net_device *dev = skb_dst(skb)->dev;
Patrick McHardy1bd9bef2006-01-05 12:20:59 -0800310
Neil Hormanedf391f2009-04-27 02:45:02 -0700311 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700312
Patrick McHardy1bd9bef2006-01-05 12:20:59 -0800313 skb->dev = dev;
314 skb->protocol = htons(ETH_P_IP);
315
Jan Engelhardt9bbc7682010-03-23 04:07:29 +0100316 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900317 ip_finish_output,
Patrick McHardy48d5cad2006-02-15 15:10:22 -0800318 !(IPCB(skb)->flags & IPSKB_REROUTED));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700319}
320
Eric Dumazet84f93072011-11-30 19:00:53 +0000321/*
322 * copy saddr and daddr, possibly using 64bit load/stores
323 * Equivalent to :
324 * iph->saddr = fl4->saddr;
325 * iph->daddr = fl4->daddr;
326 */
327static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
328{
329 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
330 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
331 memcpy(&iph->saddr, &fl4->saddr,
332 sizeof(fl4->saddr) + sizeof(fl4->daddr));
333}
334
David S. Millerd9d8da82011-05-06 22:23:20 -0700335int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700336{
David S. Millere89862f2007-01-26 01:04:55 -0800337 struct sock *sk = skb->sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700338 struct inet_sock *inet = inet_sk(sk);
Eric Dumazetf6d8bd02011-04-21 09:45:37 +0000339 struct ip_options_rcu *inet_opt;
David S. Millerb57ae012011-05-06 16:24:06 -0700340 struct flowi4 *fl4;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700341 struct rtable *rt;
342 struct iphdr *iph;
Eric Dumazetab6e3fe2010-05-10 11:31:49 +0000343 int res;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344
345 /* Skip all of this if the packet is already routed,
346 * f.e. by something like SCTP.
347 */
Eric Dumazetab6e3fe2010-05-10 11:31:49 +0000348 rcu_read_lock();
Eric Dumazetf6d8bd02011-04-21 09:45:37 +0000349 inet_opt = rcu_dereference(inet->inet_opt);
David S. Millerea4fc0d2011-05-06 22:30:20 -0700350 fl4 = &fl->u.ip4;
Eric Dumazet511c3f92009-06-02 05:14:27 +0000351 rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700352 if (rt != NULL)
353 goto packet_routed;
354
355 /* Make sure we can route this packet. */
356 rt = (struct rtable *)__sk_dst_check(sk, 0);
357 if (rt == NULL) {
Al Viro3ca3c682006-09-27 18:28:07 -0700358 __be32 daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700359
360 /* Use correct destination address if we have options. */
Eric Dumazetc720c7e2009-10-15 06:30:45 +0000361 daddr = inet->inet_daddr;
Eric Dumazetf6d8bd02011-04-21 09:45:37 +0000362 if (inet_opt && inet_opt->opt.srr)
363 daddr = inet_opt->opt.faddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700364
David S. Miller78fbfd82011-03-12 00:00:52 -0500365 /* If this fails, retransmit mechanism of transport layer will
366 * keep trying until route appears or the connection times
367 * itself out.
368 */
David S. Millerb57ae012011-05-06 16:24:06 -0700369 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
David S. Miller78fbfd82011-03-12 00:00:52 -0500370 daddr, inet->inet_saddr,
371 inet->inet_dport,
372 inet->inet_sport,
373 sk->sk_protocol,
374 RT_CONN_FLAGS(sk),
375 sk->sk_bound_dev_if);
376 if (IS_ERR(rt))
377 goto no_route;
Changli Gaod8d1f302010-06-10 23:31:35 -0700378 sk_setup_caps(sk, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379 }
Changli Gaod8d1f302010-06-10 23:31:35 -0700380 skb_dst_set_noref(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700381
382packet_routed:
David S. Millerea4fc0d2011-05-06 22:30:20 -0700383 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700384 goto no_route;
385
386 /* OK, we know where to send it, allocate and build IP header. */
Eric Dumazetf6d8bd02011-04-21 09:45:37 +0000387 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
Arnaldo Carvalho de Melo8856dfa2007-03-10 19:40:39 -0300388 skb_reset_network_header(skb);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700389 iph = ip_hdr(skb);
Al Viro714e85b2006-11-14 20:51:49 -0800390 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
Changli Gaod8d1f302010-06-10 23:31:35 -0700391 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700392 iph->frag_off = htons(IP_DF);
393 else
394 iph->frag_off = 0;
Changli Gaod8d1f302010-06-10 23:31:35 -0700395 iph->ttl = ip_select_ttl(inet, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700396 iph->protocol = sk->sk_protocol;
Eric Dumazet84f93072011-11-30 19:00:53 +0000397 ip_copy_addrs(iph, fl4);
398
Linus Torvalds1da177e2005-04-16 15:20:36 -0700399 /* Transport layer set skb->h.foo itself. */
400
Eric Dumazetf6d8bd02011-04-21 09:45:37 +0000401 if (inet_opt && inet_opt->opt.optlen) {
402 iph->ihl += inet_opt->opt.optlen >> 2;
403 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700404 }
405
Changli Gaod8d1f302010-06-10 23:31:35 -0700406 ip_select_ident_more(iph, &rt->dst, sk,
Herbert Xu79671682006-06-22 02:40:14 -0700407 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700408
Linus Torvalds1da177e2005-04-16 15:20:36 -0700409 skb->priority = sk->sk_priority;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800410 skb->mark = sk->sk_mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700411
Eric Dumazetab6e3fe2010-05-10 11:31:49 +0000412 res = ip_local_out(skb);
413 rcu_read_unlock();
414 return res;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700415
416no_route:
Eric Dumazetab6e3fe2010-05-10 11:31:49 +0000417 rcu_read_unlock();
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700418 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700419 kfree_skb(skb);
420 return -EHOSTUNREACH;
421}
Eric Dumazet4bc2f182010-07-09 21:22:10 +0000422EXPORT_SYMBOL(ip_queue_xmit);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700423
424
425static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
426{
427 to->pkt_type = from->pkt_type;
428 to->priority = from->priority;
429 to->protocol = from->protocol;
Eric Dumazetadf30902009-06-02 05:19:30 +0000430 skb_dst_drop(to);
Eric Dumazetfe76cda2010-07-01 23:48:22 +0000431 skb_dst_copy(to, from);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700432 to->dev = from->dev;
Thomas Graf82e91ff2006-11-09 15:19:14 -0800433 to->mark = from->mark;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700434
435 /* Copy the flags to each fragment. */
436 IPCB(to)->flags = IPCB(from)->flags;
437
438#ifdef CONFIG_NET_SCHED
439 to->tc_index = from->tc_index;
440#endif
Yasuyuki Kozakaie7ac05f2007-03-14 16:44:01 -0700441 nf_copy(to, from);
Jozsef Kadlecsikba9dda32007-07-07 22:21:23 -0700442#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
443 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
444 to->nf_trace = from->nf_trace;
445#endif
Julian Anastasovc98d80e2005-10-22 13:39:21 +0300446#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
447 to->ipvs_property = from->ipvs_property;
448#endif
James Morris984bc162006-06-09 00:29:17 -0700449 skb_copy_secmark(to, from);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450}
451
452/*
453 * This IP datagram is too large to be sent in one piece. Break it up into
454 * smaller pieces (each of size equal to IP header plus
455 * a block of the data of the original IP data part) that will yet fit in a
456 * single device frame, and queue such a frame for sending.
457 */
458
Jianjun Kongd93191002008-11-03 00:23:42 -0800459int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700460{
461 struct iphdr *iph;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700462 int ptr;
463 struct net_device *dev;
464 struct sk_buff *skb2;
Changli Gaoc893b802010-07-31 13:25:08 +0000465 unsigned int mtu, hlen, left, len, ll_rs;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700466 int offset;
Alexey Dobriyan76ab6082006-01-06 13:24:29 -0800467 __be16 not_last_frag;
Eric Dumazet511c3f92009-06-02 05:14:27 +0000468 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700469 int err = 0;
470
Changli Gaod8d1f302010-06-10 23:31:35 -0700471 dev = rt->dst.dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700472
473 /*
474 * Point into the IP datagram header.
475 */
476
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700477 iph = ip_hdr(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478
479 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700480 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700481 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
John Heffner628a5c52007-04-20 15:53:27 -0700482 htonl(ip_skb_dst_mtu(skb)));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700483 kfree_skb(skb);
484 return -EMSGSIZE;
485 }
486
487 /*
488 * Setup starting values.
489 */
490
491 hlen = iph->ihl * 4;
Changli Gaod8d1f302010-06-10 23:31:35 -0700492 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
Bart De Schuymer6c79bf02010-04-20 16:22:01 +0200493#ifdef CONFIG_BRIDGE_NETFILTER
494 if (skb->nf_bridge)
495 mtu -= nf_bridge_mtu_reduction(skb);
496#endif
Herbert Xu89cee8b2005-12-13 23:14:27 -0800497 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700498
499 /* When frag_list is given, use it. First, check its validity:
500 * some transformers could create wrong frag_list or break existing
501 * one, it is not prohibited. In this case fall back to copying.
502 *
503 * LATER: this step can be merged to real generation of fragments,
504 * we can switch to copy when see the first bad fragment.
505 */
David S. Miller21dc3302010-08-23 00:13:46 -0700506 if (skb_has_frag_list(skb)) {
Eric Dumazet3d130082010-09-21 08:47:45 +0000507 struct sk_buff *frag, *frag2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508 int first_len = skb_pagelen(skb);
509
510 if (first_len - hlen > mtu ||
511 ((first_len - hlen) & 7) ||
Paul Gortmaker56f8a752011-06-21 20:33:34 -0700512 ip_is_fragment(iph) ||
Linus Torvalds1da177e2005-04-16 15:20:36 -0700513 skb_cloned(skb))
514 goto slow_path;
515
David S. Millerd7fcf1a2009-06-09 00:19:37 -0700516 skb_walk_frags(skb, frag) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517 /* Correct geometry. */
518 if (frag->len > mtu ||
519 ((frag->len & 7) && frag->next) ||
520 skb_headroom(frag) < hlen)
Eric Dumazet3d130082010-09-21 08:47:45 +0000521 goto slow_path_clean;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700522
523 /* Partially cloned skb? */
524 if (skb_shared(frag))
Eric Dumazet3d130082010-09-21 08:47:45 +0000525 goto slow_path_clean;
Herbert Xu2fdba6b2005-05-18 22:52:33 -0700526
527 BUG_ON(frag->sk);
528 if (skb->sk) {
Herbert Xu2fdba6b2005-05-18 22:52:33 -0700529 frag->sk = skb->sk;
530 frag->destructor = sock_wfree;
Herbert Xu2fdba6b2005-05-18 22:52:33 -0700531 }
Eric Dumazet3d130082010-09-21 08:47:45 +0000532 skb->truesize -= frag->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700533 }
534
535 /* Everything is OK. Generate! */
536
537 err = 0;
538 offset = 0;
539 frag = skb_shinfo(skb)->frag_list;
David S. Millerd7fcf1a2009-06-09 00:19:37 -0700540 skb_frag_list_init(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700541 skb->data_len = first_len - skb_headlen(skb);
542 skb->len = first_len;
543 iph->tot_len = htons(first_len);
544 iph->frag_off = htons(IP_MF);
545 ip_send_check(iph);
546
547 for (;;) {
548 /* Prepare header of the next frame,
549 * before previous one went down. */
550 if (frag) {
551 frag->ip_summed = CHECKSUM_NONE;
Arnaldo Carvalho de Melobadff6d2007-03-13 13:06:52 -0300552 skb_reset_transport_header(frag);
Arnaldo Carvalho de Meloe2d1bca2007-04-10 20:46:21 -0700553 __skb_push(frag, hlen);
554 skb_reset_network_header(frag);
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -0700555 memcpy(skb_network_header(frag), iph, hlen);
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700556 iph = ip_hdr(frag);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700557 iph->tot_len = htons(frag->len);
558 ip_copy_metadata(frag, skb);
559 if (offset == 0)
560 ip_options_fragment(frag);
561 offset += skb->len - hlen;
562 iph->frag_off = htons(offset>>3);
563 if (frag->next != NULL)
564 iph->frag_off |= htons(IP_MF);
565 /* Ready, complete checksum */
566 ip_send_check(iph);
567 }
568
569 err = output(skb);
570
Wei Dongdafee492006-08-02 13:41:21 -0700571 if (!err)
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700572 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700573 if (err || !frag)
574 break;
575
576 skb = frag;
577 frag = skb->next;
578 skb->next = NULL;
579 }
580
581 if (err == 0) {
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700582 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700583 return 0;
584 }
585
586 while (frag) {
587 skb = frag->next;
588 kfree_skb(frag);
589 frag = skb;
590 }
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700591 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700592 return err;
Eric Dumazet3d130082010-09-21 08:47:45 +0000593
594slow_path_clean:
595 skb_walk_frags(skb, frag2) {
596 if (frag2 == frag)
597 break;
598 frag2->sk = NULL;
599 frag2->destructor = NULL;
600 skb->truesize += frag2->truesize;
601 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700602 }
603
604slow_path:
605 left = skb->len - hlen; /* Space per frame */
George Kadianakis49085bd2010-07-06 11:44:12 +0000606 ptr = hlen; /* Where to start from */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700607
Linus Torvalds1da177e2005-04-16 15:20:36 -0700608 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
Stephen Hemminger9bcfcaf2006-08-29 17:48:57 -0700609 * we need to make room for the encapsulating header
610 */
Changli Gaoc893b802010-07-31 13:25:08 +0000611 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
Stephen Hemminger9bcfcaf2006-08-29 17:48:57 -0700612
Linus Torvalds1da177e2005-04-16 15:20:36 -0700613 /*
614 * Fragment the datagram.
615 */
616
617 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
618 not_last_frag = iph->frag_off & htons(IP_MF);
619
620 /*
621 * Keep copying data until we run out.
622 */
623
Stephen Hemminger132adf52007-03-08 20:44:43 -0800624 while (left > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700625 len = left;
626 /* IF: it doesn't fit, use 'mtu' - the data space left */
627 if (len > mtu)
628 len = mtu;
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300629 /* IF: we are not sending up to and including the packet end
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630 then align the next start on an eight byte boundary */
631 if (len < left) {
632 len &= ~7;
633 }
634 /*
635 * Allocate buffer.
636 */
637
638 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
Patrick McHardy64ce2072005-08-09 20:50:53 -0700639 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700640 err = -ENOMEM;
641 goto fail;
642 }
643
644 /*
645 * Set up data on packet
646 */
647
648 ip_copy_metadata(skb2, skb);
649 skb_reserve(skb2, ll_rs);
650 skb_put(skb2, len + hlen);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -0700651 skb_reset_network_header(skb2);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -0700652 skb2->transport_header = skb2->network_header + hlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700653
654 /*
655 * Charge the memory for the fragment to any owner
656 * it might possess
657 */
658
659 if (skb->sk)
660 skb_set_owner_w(skb2, skb->sk);
661
662 /*
663 * Copy the packet header into the new buffer.
664 */
665
Arnaldo Carvalho de Melod626f622007-03-27 18:55:52 -0300666 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700667
668 /*
669 * Copy a block of the IP datagram.
670 */
Arnaldo Carvalho de Melobff9b612007-03-16 17:19:57 -0300671 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700672 BUG();
673 left -= len;
674
675 /*
676 * Fill in the new header fields.
677 */
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -0700678 iph = ip_hdr(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700679 iph->frag_off = htons((offset >> 3));
680
681 /* ANK: dirty, but effective trick. Upgrade options only if
682 * the segment to be fragmented was THE FIRST (otherwise,
683 * options are already fixed) and make it ONCE
684 * on the initial skb, so that all the following fragments
685 * will inherit fixed options.
686 */
687 if (offset == 0)
688 ip_options_fragment(skb);
689
690 /*
691 * Added AC : If we are fragmenting a fragment that's not the
692 * last fragment then keep MF on each bit
693 */
694 if (left > 0 || not_last_frag)
695 iph->frag_off |= htons(IP_MF);
696 ptr += len;
697 offset += len;
698
699 /*
700 * Put this fragment into the sending queue.
701 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700702 iph->tot_len = htons(len + hlen);
703
704 ip_send_check(iph);
705
706 err = output(skb2);
707 if (err)
708 goto fail;
Wei Dongdafee492006-08-02 13:41:21 -0700709
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700711 }
Eric Dumazet5d0ba552012-06-04 01:17:19 +0000712 consume_skb(skb);
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700713 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700714 return err;
715
716fail:
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900717 kfree_skb(skb);
Pavel Emelyanov5e38e272008-07-16 20:19:49 -0700718 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700719 return err;
720}
Patrick McHardy2e2f7ae2006-04-04 13:42:35 -0700721EXPORT_SYMBOL(ip_fragment);
722
Linus Torvalds1da177e2005-04-16 15:20:36 -0700723int
724ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
725{
726 struct iovec *iov = from;
727
Patrick McHardy84fa7932006-08-29 16:44:56 -0700728 if (skb->ip_summed == CHECKSUM_PARTIAL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700729 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
730 return -EFAULT;
731 } else {
Al Viro44bb9362006-11-14 21:36:14 -0800732 __wsum csum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700733 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
734 return -EFAULT;
735 skb->csum = csum_block_add(skb->csum, csum, odd);
736 }
737 return 0;
738}
Eric Dumazet4bc2f182010-07-09 21:22:10 +0000739EXPORT_SYMBOL(ip_generic_getfrag);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740
Al Viro44bb9362006-11-14 21:36:14 -0800741static inline __wsum
Linus Torvalds1da177e2005-04-16 15:20:36 -0700742csum_page(struct page *page, int offset, int copy)
743{
744 char *kaddr;
Al Viro44bb9362006-11-14 21:36:14 -0800745 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700746 kaddr = kmap(page);
747 csum = csum_partial(kaddr + offset, copy, 0);
748 kunmap(page);
749 return csum;
750}
751
Adrian Bunk4b30b1c2005-11-29 16:27:20 -0800752static inline int ip_ufo_append_data(struct sock *sk,
Herbert Xu1470ddf2011-03-01 02:36:47 +0000753 struct sk_buff_head *queue,
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700754 int getfrag(void *from, char *to, int offset, int len,
755 int odd, struct sk_buff *skb),
756 void *from, int length, int hh_len, int fragheaderlen,
Bill Sommerfeldd9be4f72011-07-19 15:22:33 +0000757 int transhdrlen, int maxfraglen, unsigned int flags)
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700758{
759 struct sk_buff *skb;
760 int err;
761
762 /* There is support for UDP fragmentation offload by network
763 * device, so create one single skb packet containing complete
764 * udp datagram
765 */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000766 if ((skb = skb_peek_tail(queue)) == NULL) {
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700767 skb = sock_alloc_send_skb(sk,
768 hh_len + fragheaderlen + transhdrlen + 20,
769 (flags & MSG_DONTWAIT), &err);
770
771 if (skb == NULL)
772 return err;
773
774 /* reserve space for Hardware header */
775 skb_reserve(skb, hh_len);
776
777 /* create space for UDP/IP header */
Jianjun Kongd93191002008-11-03 00:23:42 -0800778 skb_put(skb, fragheaderlen + transhdrlen);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700779
780 /* initialize network header pointer */
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -0700781 skb_reset_network_header(skb);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700782
783 /* initialize protocol header pointer */
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -0700784 skb->transport_header = skb->network_header + fragheaderlen;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700785
Patrick McHardy84fa7932006-08-29 16:44:56 -0700786 skb->ip_summed = CHECKSUM_PARTIAL;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700787 skb->csum = 0;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700788
Kostya Bbe9164e2008-04-29 22:36:30 -0700789 /* specify the length of each IP datagram fragment */
Bill Sommerfeldd9be4f72011-07-19 15:22:33 +0000790 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
Herbert Xuf83ef8c2006-06-30 13:37:03 -0700791 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
Herbert Xu1470ddf2011-03-01 02:36:47 +0000792 __skb_queue_tail(queue, skb);
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700793 }
Kostya Bbe9164e2008-04-29 22:36:30 -0700794
795 return skb_append_datato_frags(sk, skb, getfrag, from,
796 (length - transhdrlen));
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700797}
798
David S. Millerf5fca602011-05-08 17:24:10 -0700799static int __ip_append_data(struct sock *sk,
800 struct flowi4 *fl4,
801 struct sk_buff_head *queue,
Herbert Xu1470ddf2011-03-01 02:36:47 +0000802 struct inet_cork *cork,
803 int getfrag(void *from, char *to, int offset,
804 int len, int odd, struct sk_buff *skb),
805 void *from, int length, int transhdrlen,
806 unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700807{
808 struct inet_sock *inet = inet_sk(sk);
809 struct sk_buff *skb;
810
Herbert Xu07df5292011-03-01 23:00:58 -0800811 struct ip_options *opt = cork->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700812 int hh_len;
813 int exthdrlen;
814 int mtu;
815 int copy;
816 int err;
817 int offset = 0;
818 unsigned int maxfraglen, fragheaderlen;
819 int csummode = CHECKSUM_NONE;
Herbert Xu1470ddf2011-03-01 02:36:47 +0000820 struct rtable *rt = (struct rtable *)cork->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700821
Steffen Klassert96d73032011-06-05 20:48:47 +0000822 skb = skb_peek_tail(queue);
823
824 exthdrlen = !skb ? rt->dst.header_len : 0;
Herbert Xu07df5292011-03-01 23:00:58 -0800825 mtu = cork->fragsize;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700826
Changli Gaod8d1f302010-06-10 23:31:35 -0700827 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700828
829 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
830 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
831
Herbert Xu1470ddf2011-03-01 02:36:47 +0000832 if (cork->length + length > 0xFFFF - fragheaderlen) {
David S. Millerf5fca602011-05-08 17:24:10 -0700833 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
Eric Dumazetc720c7e2009-10-15 06:30:45 +0000834 mtu-exthdrlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700835 return -EMSGSIZE;
836 }
837
838 /*
839 * transhdrlen > 0 means that this is the first fragment and we wish
840 * it won't be fragmented in the future.
841 */
842 if (transhdrlen &&
843 length + fragheaderlen <= mtu &&
Changli Gaod8d1f302010-06-10 23:31:35 -0700844 rt->dst.dev->features & NETIF_F_V4_CSUM &&
Linus Torvalds1da177e2005-04-16 15:20:36 -0700845 !exthdrlen)
Patrick McHardy84fa7932006-08-29 16:44:56 -0700846 csummode = CHECKSUM_PARTIAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700847
Herbert Xu1470ddf2011-03-01 02:36:47 +0000848 cork->length += length;
Herbert Xu26cde9f2010-06-15 01:52:25 +0000849 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
Kostya Bbe9164e2008-04-29 22:36:30 -0700850 (sk->sk_protocol == IPPROTO_UDP) &&
Steffen Klassertc1460662011-06-29 23:19:32 +0000851 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
Herbert Xu1470ddf2011-03-01 02:36:47 +0000852 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
853 hh_len, fragheaderlen, transhdrlen,
Bill Sommerfeldd9be4f72011-07-19 15:22:33 +0000854 maxfraglen, flags);
Patrick McHardybaa829d2006-03-12 20:35:12 -0800855 if (err)
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700856 goto error;
Ananda Rajue89e9cf2005-10-18 15:46:41 -0700857 return 0;
858 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700859
860 /* So, what's going on in the loop below?
861 *
862 * We use calculated fragment length to generate chained skb,
863 * each of segments is IP fragment ready for sending to network after
864 * adding appropriate IP header.
865 */
866
Herbert Xu26cde9f2010-06-15 01:52:25 +0000867 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700868 goto alloc_new_skb;
869
870 while (length > 0) {
871 /* Check if the remaining data fits into current packet. */
872 copy = mtu - skb->len;
873 if (copy < length)
874 copy = maxfraglen - skb->len;
875 if (copy <= 0) {
876 char *data;
877 unsigned int datalen;
878 unsigned int fraglen;
879 unsigned int fraggap;
880 unsigned int alloclen;
881 struct sk_buff *skb_prev;
882alloc_new_skb:
883 skb_prev = skb;
884 if (skb_prev)
885 fraggap = skb_prev->len - maxfraglen;
886 else
887 fraggap = 0;
888
889 /*
890 * If remaining data exceeds the mtu,
891 * we know we need more fragment(s).
892 */
893 datalen = length + fraggap;
894 if (datalen > mtu - fragheaderlen)
895 datalen = maxfraglen - fragheaderlen;
896 fraglen = datalen + fragheaderlen;
897
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900898 if ((flags & MSG_MORE) &&
Changli Gaod8d1f302010-06-10 23:31:35 -0700899 !(rt->dst.dev->features&NETIF_F_SG))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700900 alloclen = mtu;
901 else
Eric Dumazet59104f02010-09-20 20:16:27 +0000902 alloclen = fraglen;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700903
Steffen Klassert353e5c92011-06-22 01:05:37 +0000904 alloclen += exthdrlen;
905
Linus Torvalds1da177e2005-04-16 15:20:36 -0700906 /* The last fragment gets additional space at tail.
907 * Note, with MSG_MORE we overallocate on fragments,
908 * because we have no idea what fragment will be
909 * the last.
910 */
Steffen Klassert33f99dc2011-06-22 01:04:37 +0000911 if (datalen == length + fraggap)
Changli Gaod8d1f302010-06-10 23:31:35 -0700912 alloclen += rt->dst.trailer_len;
Steffen Klassert33f99dc2011-06-22 01:04:37 +0000913
Linus Torvalds1da177e2005-04-16 15:20:36 -0700914 if (transhdrlen) {
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900915 skb = sock_alloc_send_skb(sk,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700916 alloclen + hh_len + 15,
917 (flags & MSG_DONTWAIT), &err);
918 } else {
919 skb = NULL;
920 if (atomic_read(&sk->sk_wmem_alloc) <=
921 2 * sk->sk_sndbuf)
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900922 skb = sock_wmalloc(sk,
Linus Torvalds1da177e2005-04-16 15:20:36 -0700923 alloclen + hh_len + 15, 1,
924 sk->sk_allocation);
925 if (unlikely(skb == NULL))
926 err = -ENOBUFS;
Patrick Ohly51f31ca2009-02-12 05:03:39 +0000927 else
928 /* only the initial fragment is
929 time stamped */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000930 cork->tx_flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700931 }
932 if (skb == NULL)
933 goto error;
934
935 /*
936 * Fill in the control structures
937 */
938 skb->ip_summed = csummode;
939 skb->csum = 0;
940 skb_reserve(skb, hh_len);
Herbert Xu1470ddf2011-03-01 02:36:47 +0000941 skb_shinfo(skb)->tx_flags = cork->tx_flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700942
943 /*
944 * Find where to start putting bytes.
945 */
Steffen Klassert353e5c92011-06-22 01:05:37 +0000946 data = skb_put(skb, fraglen + exthdrlen);
Arnaldo Carvalho de Meloc14d2452007-03-11 22:39:41 -0300947 skb_set_network_header(skb, exthdrlen);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -0700948 skb->transport_header = (skb->network_header +
949 fragheaderlen);
Steffen Klassert353e5c92011-06-22 01:05:37 +0000950 data += fragheaderlen + exthdrlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700951
952 if (fraggap) {
953 skb->csum = skb_copy_and_csum_bits(
954 skb_prev, maxfraglen,
955 data + transhdrlen, fraggap, 0);
956 skb_prev->csum = csum_sub(skb_prev->csum,
957 skb->csum);
958 data += fraggap;
Herbert Xue9fa4f72006-08-13 20:12:58 -0700959 pskb_trim_unique(skb_prev, maxfraglen);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700960 }
961
962 copy = datalen - transhdrlen - fraggap;
963 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
964 err = -EFAULT;
965 kfree_skb(skb);
966 goto error;
967 }
968
969 offset += copy;
970 length -= datalen - fraggap;
971 transhdrlen = 0;
972 exthdrlen = 0;
973 csummode = CHECKSUM_NONE;
974
975 /*
976 * Put the packet on the pending queue.
977 */
Herbert Xu1470ddf2011-03-01 02:36:47 +0000978 __skb_queue_tail(queue, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700979 continue;
980 }
981
982 if (copy > length)
983 copy = length;
984
Changli Gaod8d1f302010-06-10 23:31:35 -0700985 if (!(rt->dst.dev->features&NETIF_F_SG)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700986 unsigned int off;
987
988 off = skb->len;
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +0900989 if (getfrag(from, skb_put(skb, copy),
Linus Torvalds1da177e2005-04-16 15:20:36 -0700990 offset, copy, off, skb) < 0) {
991 __skb_trim(skb, off);
992 err = -EFAULT;
993 goto error;
994 }
995 } else {
996 int i = skb_shinfo(skb)->nr_frags;
997 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
Herbert Xu1470ddf2011-03-01 02:36:47 +0000998 struct page *page = cork->page;
999 int off = cork->off;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001000 unsigned int left;
1001
1002 if (page && (left = PAGE_SIZE - off) > 0) {
1003 if (copy >= left)
1004 copy = left;
Ian Campbellaff65da2011-08-22 23:44:59 +00001005 if (page != skb_frag_page(frag)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001006 if (i == MAX_SKB_FRAGS) {
1007 err = -EMSGSIZE;
1008 goto error;
1009 }
Herbert Xu1470ddf2011-03-01 02:36:47 +00001010 skb_fill_page_desc(skb, i, page, off, 0);
Ian Campbellaff65da2011-08-22 23:44:59 +00001011 skb_frag_ref(skb, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001012 frag = &skb_shinfo(skb)->frags[i];
1013 }
1014 } else if (i < MAX_SKB_FRAGS) {
1015 if (copy > PAGE_SIZE)
1016 copy = PAGE_SIZE;
1017 page = alloc_pages(sk->sk_allocation, 0);
1018 if (page == NULL) {
1019 err = -ENOMEM;
1020 goto error;
1021 }
Herbert Xu1470ddf2011-03-01 02:36:47 +00001022 cork->page = page;
1023 cork->off = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001024
1025 skb_fill_page_desc(skb, i, page, 0, 0);
1026 frag = &skb_shinfo(skb)->frags[i];
Linus Torvalds1da177e2005-04-16 15:20:36 -07001027 } else {
1028 err = -EMSGSIZE;
1029 goto error;
1030 }
Eric Dumazet9e903e02011-10-18 21:00:24 +00001031 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
Ian Campbellaff65da2011-08-22 23:44:59 +00001032 offset, copy, skb->len, skb) < 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001033 err = -EFAULT;
1034 goto error;
1035 }
Herbert Xu1470ddf2011-03-01 02:36:47 +00001036 cork->off += copy;
Eric Dumazet9e903e02011-10-18 21:00:24 +00001037 skb_frag_size_add(frag, copy);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001038 skb->len += copy;
1039 skb->data_len += copy;
Herbert Xuf945fa72008-01-22 22:39:26 -08001040 skb->truesize += copy;
1041 atomic_add(copy, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001042 }
1043 offset += copy;
1044 length -= copy;
1045 }
1046
1047 return 0;
1048
1049error:
Herbert Xu1470ddf2011-03-01 02:36:47 +00001050 cork->length -= length;
Pavel Emelyanov5e38e272008-07-16 20:19:49 -07001051 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001052 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001053}
1054
Herbert Xu1470ddf2011-03-01 02:36:47 +00001055static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1056 struct ipcm_cookie *ipc, struct rtable **rtp)
1057{
1058 struct inet_sock *inet = inet_sk(sk);
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001059 struct ip_options_rcu *opt;
Herbert Xu1470ddf2011-03-01 02:36:47 +00001060 struct rtable *rt;
1061
1062 /*
1063 * setup for corking.
1064 */
1065 opt = ipc->opt;
1066 if (opt) {
1067 if (cork->opt == NULL) {
1068 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1069 sk->sk_allocation);
1070 if (unlikely(cork->opt == NULL))
1071 return -ENOBUFS;
1072 }
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001073 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001074 cork->flags |= IPCORK_OPT;
1075 cork->addr = ipc->addr;
1076 }
1077 rt = *rtp;
1078 if (unlikely(!rt))
1079 return -EFAULT;
1080 /*
1081 * We steal reference to this route, caller should not release it
1082 */
1083 *rtp = NULL;
1084 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
Steffen Klassert353e5c92011-06-22 01:05:37 +00001085 rt->dst.dev->mtu : dst_mtu(&rt->dst);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001086 cork->dst = &rt->dst;
1087 cork->length = 0;
1088 cork->tx_flags = ipc->tx_flags;
1089 cork->page = NULL;
1090 cork->off = 0;
1091
1092 return 0;
1093}
1094
1095/*
1096 * ip_append_data() and ip_append_page() can make one large IP datagram
1097 * from many pieces of data. Each pieces will be holded on the socket
1098 * until ip_push_pending_frames() is called. Each piece can be a page
1099 * or non-page data.
1100 *
1101 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1102 * this interface potentially.
1103 *
1104 * LATER: length must be adjusted by pad at tail, when it is required.
1105 */
David S. Millerf5fca602011-05-08 17:24:10 -07001106int ip_append_data(struct sock *sk, struct flowi4 *fl4,
Herbert Xu1470ddf2011-03-01 02:36:47 +00001107 int getfrag(void *from, char *to, int offset, int len,
1108 int odd, struct sk_buff *skb),
1109 void *from, int length, int transhdrlen,
1110 struct ipcm_cookie *ipc, struct rtable **rtp,
1111 unsigned int flags)
1112{
1113 struct inet_sock *inet = inet_sk(sk);
1114 int err;
1115
1116 if (flags&MSG_PROBE)
1117 return 0;
1118
1119 if (skb_queue_empty(&sk->sk_write_queue)) {
David S. Millerbdc712b2011-05-06 15:02:07 -07001120 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
Herbert Xu1470ddf2011-03-01 02:36:47 +00001121 if (err)
1122 return err;
1123 } else {
1124 transhdrlen = 0;
1125 }
1126
David S. Millerf5fca602011-05-08 17:24:10 -07001127 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
Herbert Xu1470ddf2011-03-01 02:36:47 +00001128 from, length, transhdrlen, flags);
1129}
1130
David S. Millerf5fca602011-05-08 17:24:10 -07001131ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001132 int offset, size_t size, int flags)
1133{
1134 struct inet_sock *inet = inet_sk(sk);
1135 struct sk_buff *skb;
1136 struct rtable *rt;
1137 struct ip_options *opt = NULL;
David S. Millerbdc712b2011-05-06 15:02:07 -07001138 struct inet_cork *cork;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001139 int hh_len;
1140 int mtu;
1141 int len;
1142 int err;
1143 unsigned int maxfraglen, fragheaderlen, fraggap;
1144
1145 if (inet->hdrincl)
1146 return -EPERM;
1147
1148 if (flags&MSG_PROBE)
1149 return 0;
1150
1151 if (skb_queue_empty(&sk->sk_write_queue))
1152 return -EINVAL;
1153
David S. Millerbdc712b2011-05-06 15:02:07 -07001154 cork = &inet->cork.base;
1155 rt = (struct rtable *)cork->dst;
1156 if (cork->flags & IPCORK_OPT)
1157 opt = cork->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001158
Changli Gaod8d1f302010-06-10 23:31:35 -07001159 if (!(rt->dst.dev->features&NETIF_F_SG))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001160 return -EOPNOTSUPP;
1161
Changli Gaod8d1f302010-06-10 23:31:35 -07001162 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
David S. Millerbdc712b2011-05-06 15:02:07 -07001163 mtu = cork->fragsize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001164
1165 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1166 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1167
David S. Millerbdc712b2011-05-06 15:02:07 -07001168 if (cork->length + size > 0xFFFF - fragheaderlen) {
David S. Millerf5fca602011-05-08 17:24:10 -07001169 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001170 return -EMSGSIZE;
1171 }
1172
1173 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1174 return -EINVAL;
1175
David S. Millerbdc712b2011-05-06 15:02:07 -07001176 cork->length += size;
Herbert Xu26cde9f2010-06-15 01:52:25 +00001177 if ((size + skb->len > mtu) &&
1178 (sk->sk_protocol == IPPROTO_UDP) &&
Changli Gaod8d1f302010-06-10 23:31:35 -07001179 (rt->dst.dev->features & NETIF_F_UFO)) {
Herbert Xu79671682006-06-22 02:40:14 -07001180 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
Herbert Xuf83ef8c2006-06-30 13:37:03 -07001181 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
Herbert Xu79671682006-06-22 02:40:14 -07001182 }
Ananda Rajue89e9cf2005-10-18 15:46:41 -07001183
Linus Torvalds1da177e2005-04-16 15:20:36 -07001184
1185 while (size > 0) {
1186 int i;
1187
Herbert Xu89114af2006-07-08 13:34:32 -07001188 if (skb_is_gso(skb))
Ananda Rajue89e9cf2005-10-18 15:46:41 -07001189 len = size;
1190 else {
1191
1192 /* Check if the remaining data fits into current packet. */
1193 len = mtu - skb->len;
1194 if (len < size)
1195 len = maxfraglen - skb->len;
1196 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001197 if (len <= 0) {
1198 struct sk_buff *skb_prev;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001199 int alloclen;
1200
1201 skb_prev = skb;
Jayachandran C0d0d2bb2005-10-13 11:43:02 -07001202 fraggap = skb_prev->len - maxfraglen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001203
1204 alloclen = fragheaderlen + hh_len + fraggap + 15;
1205 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1206 if (unlikely(!skb)) {
1207 err = -ENOBUFS;
1208 goto error;
1209 }
1210
1211 /*
1212 * Fill in the control structures
1213 */
1214 skb->ip_summed = CHECKSUM_NONE;
1215 skb->csum = 0;
1216 skb_reserve(skb, hh_len);
1217
1218 /*
1219 * Find where to start putting bytes.
1220 */
Arnaldo Carvalho de Melo967b05f2007-03-13 13:51:52 -03001221 skb_put(skb, fragheaderlen + fraggap);
Arnaldo Carvalho de Melo2ca9e6f2007-03-10 19:15:25 -03001222 skb_reset_network_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001223 skb->transport_header = (skb->network_header +
1224 fragheaderlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001225 if (fraggap) {
Arnaldo Carvalho de Melo967b05f2007-03-13 13:51:52 -03001226 skb->csum = skb_copy_and_csum_bits(skb_prev,
1227 maxfraglen,
Arnaldo Carvalho de Melo9c702202007-04-25 18:04:18 -07001228 skb_transport_header(skb),
Arnaldo Carvalho de Melo967b05f2007-03-13 13:51:52 -03001229 fraggap, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001230 skb_prev->csum = csum_sub(skb_prev->csum,
1231 skb->csum);
Herbert Xue9fa4f72006-08-13 20:12:58 -07001232 pskb_trim_unique(skb_prev, maxfraglen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001233 }
1234
1235 /*
1236 * Put the packet on the pending queue.
1237 */
1238 __skb_queue_tail(&sk->sk_write_queue, skb);
1239 continue;
1240 }
1241
1242 i = skb_shinfo(skb)->nr_frags;
1243 if (len > size)
1244 len = size;
1245 if (skb_can_coalesce(skb, i, page, offset)) {
Eric Dumazet9e903e02011-10-18 21:00:24 +00001246 skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001247 } else if (i < MAX_SKB_FRAGS) {
1248 get_page(page);
1249 skb_fill_page_desc(skb, i, page, offset, len);
1250 } else {
1251 err = -EMSGSIZE;
1252 goto error;
1253 }
1254
1255 if (skb->ip_summed == CHECKSUM_NONE) {
Al Viro44bb9362006-11-14 21:36:14 -08001256 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001257 csum = csum_page(page, offset, len);
1258 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1259 }
1260
1261 skb->len += len;
1262 skb->data_len += len;
David S. Miller1e34a112008-01-22 23:44:31 -08001263 skb->truesize += len;
1264 atomic_add(len, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001265 offset += len;
1266 size -= len;
1267 }
1268 return 0;
1269
1270error:
David S. Millerbdc712b2011-05-06 15:02:07 -07001271 cork->length -= size;
Pavel Emelyanov5e38e272008-07-16 20:19:49 -07001272 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001273 return err;
1274}
1275
Herbert Xu1470ddf2011-03-01 02:36:47 +00001276static void ip_cork_release(struct inet_cork *cork)
Pavel Emelyanov429f08e2007-11-05 21:03:24 -08001277{
Herbert Xu1470ddf2011-03-01 02:36:47 +00001278 cork->flags &= ~IPCORK_OPT;
1279 kfree(cork->opt);
1280 cork->opt = NULL;
1281 dst_release(cork->dst);
1282 cork->dst = NULL;
Pavel Emelyanov429f08e2007-11-05 21:03:24 -08001283}
1284
Linus Torvalds1da177e2005-04-16 15:20:36 -07001285/*
1286 * Combined all pending IP fragments on the socket as one IP datagram
1287 * and push them out.
1288 */
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001289struct sk_buff *__ip_make_skb(struct sock *sk,
David S. Miller77968b72011-05-08 17:12:19 -07001290 struct flowi4 *fl4,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001291 struct sk_buff_head *queue,
1292 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001293{
1294 struct sk_buff *skb, *tmp_skb;
1295 struct sk_buff **tail_skb;
1296 struct inet_sock *inet = inet_sk(sk);
Pavel Emelyanov0388b002008-07-14 23:00:43 -07001297 struct net *net = sock_net(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001298 struct ip_options *opt = NULL;
Herbert Xu1470ddf2011-03-01 02:36:47 +00001299 struct rtable *rt = (struct rtable *)cork->dst;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001300 struct iphdr *iph;
Alexey Dobriyan76ab6082006-01-06 13:24:29 -08001301 __be16 df = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001302 __u8 ttl;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001303
Herbert Xu1470ddf2011-03-01 02:36:47 +00001304 if ((skb = __skb_dequeue(queue)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001305 goto out;
1306 tail_skb = &(skb_shinfo(skb)->frag_list);
1307
1308 /* move skb->data to ip header from ext header */
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001309 if (skb->data < skb_network_header(skb))
Arnaldo Carvalho de Melobbe735e2007-03-10 22:16:10 -03001310 __skb_pull(skb, skb_network_offset(skb));
Herbert Xu1470ddf2011-03-01 02:36:47 +00001311 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
Arnaldo Carvalho de Melocfe1fc72007-03-16 17:26:39 -03001312 __skb_pull(tmp_skb, skb_network_header_len(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001313 *tail_skb = tmp_skb;
1314 tail_skb = &(tmp_skb->next);
1315 skb->len += tmp_skb->len;
1316 skb->data_len += tmp_skb->len;
1317 skb->truesize += tmp_skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001318 tmp_skb->destructor = NULL;
1319 tmp_skb->sk = NULL;
1320 }
1321
1322 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1323 * to fragment the frame generated here. No matter, what transforms
1324 * how transforms change size of the packet, it will come out.
1325 */
John Heffner628a5c52007-04-20 15:53:27 -07001326 if (inet->pmtudisc < IP_PMTUDISC_DO)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001327 skb->local_df = 1;
1328
1329 /* DF bit is set when we want to see DF on outgoing frames.
1330 * If local_df is set too, we still allow to fragment this frame
1331 * locally. */
John Heffner628a5c52007-04-20 15:53:27 -07001332 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
Changli Gaod8d1f302010-06-10 23:31:35 -07001333 (skb->len <= dst_mtu(&rt->dst) &&
1334 ip_dont_fragment(sk, &rt->dst)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001335 df = htons(IP_DF);
1336
Herbert Xu1470ddf2011-03-01 02:36:47 +00001337 if (cork->flags & IPCORK_OPT)
1338 opt = cork->opt;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001339
1340 if (rt->rt_type == RTN_MULTICAST)
1341 ttl = inet->mc_ttl;
1342 else
Changli Gaod8d1f302010-06-10 23:31:35 -07001343 ttl = ip_select_ttl(inet, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001344
1345 iph = (struct iphdr *)skb->data;
1346 iph->version = 4;
1347 iph->ihl = 5;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001348 iph->tos = inet->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001349 iph->frag_off = df;
Changli Gaod8d1f302010-06-10 23:31:35 -07001350 ip_select_ident(iph, &rt->dst, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001351 iph->ttl = ttl;
1352 iph->protocol = sk->sk_protocol;
Eric Dumazet84f93072011-11-30 19:00:53 +00001353 ip_copy_addrs(iph, fl4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001354
David S. Miller22f728f2011-05-13 17:21:27 -04001355 if (opt) {
1356 iph->ihl += opt->optlen>>2;
1357 ip_options_build(skb, opt, cork->addr, rt, 0);
1358 }
1359
Linus Torvalds1da177e2005-04-16 15:20:36 -07001360 skb->priority = sk->sk_priority;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -08001361 skb->mark = sk->sk_mark;
Eric Dumazeta21bba92008-11-24 16:07:50 -08001362 /*
1363 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1364 * on dst refcount
1365 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001366 cork->dst = NULL;
Changli Gaod8d1f302010-06-10 23:31:35 -07001367 skb_dst_set(skb, &rt->dst);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001368
David L Stevens96793b42007-09-17 09:57:33 -07001369 if (iph->protocol == IPPROTO_ICMP)
Pavel Emelyanov0388b002008-07-14 23:00:43 -07001370 icmp_out_count(net, ((struct icmphdr *)
David L Stevens96793b42007-09-17 09:57:33 -07001371 skb_transport_header(skb))->type);
1372
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001373 ip_cork_release(cork);
1374out:
1375 return skb;
1376}
1377
1378int ip_send_skb(struct sk_buff *skb)
1379{
1380 struct net *net = sock_net(skb->sk);
1381 int err;
1382
Herbert Xuc439cb22008-01-11 19:14:00 -08001383 err = ip_local_out(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001384 if (err) {
1385 if (err > 0)
Eric Dumazet6ce9e7b2009-09-02 18:05:33 -07001386 err = net_xmit_errno(err);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387 if (err)
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001388 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001389 }
1390
Linus Torvalds1da177e2005-04-16 15:20:36 -07001391 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001392}
1393
/*
 *	Build the pending datagram with ip_finish_skb() and send it with
 *	ip_send_skb().  Returns 0 if there was nothing to send, otherwise
 *	the result of ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets the whole, not yet fragmented, skb. */
	return skb ? ip_send_skb(skb) : 0;
}
1405
Linus Torvalds1da177e2005-04-16 15:20:36 -07001406/*
1407 * Throw away all pending data on the socket.
1408 */
Herbert Xu1470ddf2011-03-01 02:36:47 +00001409static void __ip_flush_pending_frames(struct sock *sk,
1410 struct sk_buff_head *queue,
1411 struct inet_cork *cork)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001412{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001413 struct sk_buff *skb;
1414
Herbert Xu1470ddf2011-03-01 02:36:47 +00001415 while ((skb = __skb_dequeue_tail(queue)) != NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001416 kfree_skb(skb);
1417
Herbert Xu1470ddf2011-03-01 02:36:47 +00001418 ip_cork_release(cork);
1419}
1420
1421void ip_flush_pending_frames(struct sock *sk)
1422{
David S. Millerbdc712b2011-05-06 15:02:07 -07001423 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001424}
1425
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001426struct sk_buff *ip_make_skb(struct sock *sk,
David S. Miller77968b72011-05-08 17:12:19 -07001427 struct flowi4 *fl4,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001428 int getfrag(void *from, char *to, int offset,
1429 int len, int odd, struct sk_buff *skb),
1430 void *from, int length, int transhdrlen,
1431 struct ipcm_cookie *ipc, struct rtable **rtp,
1432 unsigned int flags)
1433{
David S. Millerb80d7222011-05-06 15:06:01 -07001434 struct inet_cork cork;
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001435 struct sk_buff_head queue;
1436 int err;
1437
1438 if (flags & MSG_PROBE)
1439 return NULL;
1440
1441 __skb_queue_head_init(&queue);
1442
David S. Millerb80d7222011-05-06 15:06:01 -07001443 cork.flags = 0;
1444 cork.addr = 0;
David S. Miller70652722011-05-06 16:01:15 -07001445 cork.opt = NULL;
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001446 err = ip_setup_cork(sk, &cork, ipc, rtp);
1447 if (err)
1448 return ERR_PTR(err);
1449
David S. Millerf5fca602011-05-08 17:24:10 -07001450 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001451 from, length, transhdrlen, flags);
1452 if (err) {
1453 __ip_flush_pending_frames(sk, &queue, &cork);
1454 return ERR_PTR(err);
1455 }
1456
David S. Miller77968b72011-05-08 17:12:19 -07001457 return __ip_make_skb(sk, fl4, &queue, &cork);
Herbert Xu1c32c5a2011-03-01 02:36:47 +00001458}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001459
1460/*
1461 * Fetch data from kernel space and fill in checksum if needed.
1462 */
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001463static int ip_reply_glue_bits(void *dptr, char *to, int offset,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001464 int len, int odd, struct sk_buff *skb)
1465{
Al Viro50842052006-11-14 21:36:34 -08001466 __wsum csum;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001467
1468 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1469 skb->csum = csum_block_add(skb->csum, csum, odd);
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001470 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001471}
1472
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001473/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001474 * Generic function to send a packet as reply to another packet.
1475 * Used to send TCP resets so far. ICMP should use this function too.
1476 *
YOSHIFUJI Hideakie905a9e2007-02-09 23:24:47 +09001477 * Should run single threaded per socket because it uses the sock
Linus Torvalds1da177e2005-04-16 15:20:36 -07001478 * structure to pass arguments.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001479 */
David S. Miller0a5ebb82011-05-09 13:22:43 -07001480void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
Eric Dumazet66b13d92011-10-24 03:06:21 -04001481 const struct ip_reply_arg *arg, unsigned int len)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482{
1483 struct inet_sock *inet = inet_sk(sk);
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001484 struct ip_options_data replyopts;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001485 struct ipcm_cookie ipc;
David S. Miller77968b72011-05-08 17:12:19 -07001486 struct flowi4 fl4;
Eric Dumazet511c3f92009-06-02 05:14:27 +00001487 struct rtable *rt = skb_rtable(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001488
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001489 if (ip_options_echo(&replyopts.opt.opt, skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001490 return;
1491
David S. Miller0a5ebb82011-05-09 13:22:43 -07001492 ipc.addr = daddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001493 ipc.opt = NULL;
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001494 ipc.tx_flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001495
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001496 if (replyopts.opt.opt.optlen) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497 ipc.opt = &replyopts.opt;
1498
Eric Dumazetf6d8bd02011-04-21 09:45:37 +00001499 if (replyopts.opt.opt.srr)
1500 daddr = replyopts.opt.opt.faddr;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001501 }
1502
David S. Miller77968b72011-05-08 17:12:19 -07001503 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
Eric Dumazet66b13d92011-10-24 03:06:21 -04001504 RT_TOS(arg->tos),
David S. Miller77968b72011-05-08 17:12:19 -07001505 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1506 ip_reply_arg_flowi_flags(arg),
1507 daddr, rt->rt_spec_dst,
1508 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510 rt = ip_route_output_key(sock_net(sk), &fl4);
1511 if (IS_ERR(rt))
1512 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001513
1514 /* And let IP do all the hard work.
1515
1516 This chunk is not reenterable, hence spinlock.
1517 Note that it uses the fact, that this function is called
1518 with locally disabled BH and that sk cannot be already spinlocked.
1519 */
1520 bh_lock_sock(sk);
Eric Dumazet66b13d92011-10-24 03:06:21 -04001521 inet->tos = arg->tos;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001522 sk->sk_priority = skb->priority;
Arnaldo Carvalho de Meloeddc9ec2007-04-20 22:47:35 -07001523 sk->sk_protocol = ip_hdr(skb)->protocol;
Patrick McHardyf0e48db2007-06-04 21:32:46 -07001524 sk->sk_bound_dev_if = arg->bound_dev_if;
David S. Millerf5fca602011-05-08 17:24:10 -07001525 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
Eric Dumazet2e77d892008-11-24 15:52:46 -08001526 &ipc, &rt, MSG_DONTWAIT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1528 if (arg->csumoffset >= 0)
Arnaldo Carvalho de Melo9c702202007-04-25 18:04:18 -07001529 *((__sum16 *)skb_transport_header(skb) +
1530 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1531 arg->csum));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001532 skb->ip_summed = CHECKSUM_NONE;
David S. Miller77968b72011-05-08 17:12:19 -07001533 ip_push_pending_frames(sk, &fl4);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001534 }
1535
1536 bh_unlock_sock(sk);
1537
1538 ip_rt_put(rt);
1539}
1540
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541void __init ip_init(void)
1542{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543 ip_rt_init();
1544 inet_initpeers();
1545
1546#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1547 igmp_mc_proc_init();
1548#endif
1549}