ipv4: add IP_RECVFRAGSIZE cmsg
The IP stack records the largest fragment of a reassembled packet
in IPCB(skb)->frag_max_size. When reading a datagram or raw packet
that arrived fragmented, expose the value to allow applications to
estimate receive path MTU.
Tested:
Sent data over a veth pair of which the source has a small mtu.
Sent data using netcat, received using a dedicated process.
Verified that the cmsg IP_RECVFRAGSIZE is returned only when
data arrives fragmented, and in that cases matches the veth mtu.
ip link add veth0 type veth peer name veth1
ip netns add from
ip netns add to
ip link set dev veth1 netns to
ip netns exec to ip addr add dev veth1 192.168.10.1/24
ip netns exec to ip link set dev veth1 up
ip link set dev veth0 netns from
ip netns exec from ip addr add dev veth0 192.168.10.2/24
ip netns exec from ip link set dev veth0 up
ip netns exec from ip link set dev veth0 mtu 1300
ip netns exec from ethtool -K veth0 ufo off
dd if=/dev/zero bs=1 count=1400 2>/dev/null > payload
ip netns exec to ./recv_cmsg_recvfragsize -4 -u -p 6000 &
ip netns exec from nc -q 1 -u 192.168.10.1 6000 < payload
using github.com/wdebruij/kerneltools/blob/master/tests/recvfragsize.c
Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index b8a2d63..ecbaae2 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -97,6 +97,17 @@
put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
}
+static void ip_cmsg_recv_fragsize(struct msghdr *msg, struct sk_buff *skb)
+{
+ int val;
+
+ if (IPCB(skb)->frag_max_size == 0)
+ return;
+
+ val = IPCB(skb)->frag_max_size;
+ put_cmsg(msg, SOL_IP, IP_RECVFRAGSIZE, sizeof(val), &val);
+}
+
static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
int tlen, int offset)
{
@@ -218,6 +229,9 @@
if (flags & IP_CMSG_CHECKSUM)
ip_cmsg_recv_checksum(msg, skb, tlen, offset);
+
+ if (flags & IP_CMSG_RECVFRAGSIZE)
+ ip_cmsg_recv_fragsize(msg, skb);
}
EXPORT_SYMBOL(ip_cmsg_recv_offset);
@@ -614,6 +628,7 @@
case IP_MULTICAST_LOOP:
case IP_RECVORIGDSTADDR:
case IP_CHECKSUM:
+ case IP_RECVFRAGSIZE:
if (optlen >= sizeof(int)) {
if (get_user(val, (int __user *) optval))
return -EFAULT;
@@ -726,6 +741,14 @@
}
}
break;
+ case IP_RECVFRAGSIZE:
+ if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+ goto e_inval;
+ if (val)
+ inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
+ else
+ inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
+ break;
case IP_TOS: /* This sets both TOS and Precedence */
if (sk->sk_type == SOCK_STREAM) {
val &= ~INET_ECN_MASK;
@@ -1357,6 +1380,9 @@
case IP_CHECKSUM:
val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
break;
+ case IP_RECVFRAGSIZE:
+ val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
+ break;
case IP_TOS:
val = inet->tos;
break;