vxlan: Remote checksum offload

Add support for remote checksum offload in VXLAN. This uses a
reserved bit to indicate that RCO is being done, and uses the low order
reserved eight bits of the VNI to hold the start and offset values in a
compressed manner.

Start is encoded in the low order seven bits of VNI. This is start >> 1
so that the checksum start offset is 0-254 using even values only.
Checksum offset (transport checksum field) is indicated in the high
order bit in the low order byte of the VNI. If the bit is set, the
checksum field is for UDP (so offset = start + 6), else checksum
field is for TCP (so offset = start + 16). Only TCP and UDP are
supported in this implementation.

Remote checksum offload for VXLAN is described in:

https://tools.ietf.org/html/draft-herbert-vxlan-rco-00

Tested by running 200 TCP_STREAM connections with VXLAN (over IPv4).

With UDP checksums and Remote Checksum Offload
  IPv4
      Client
        11.84% CPU utilization
      Server
        12.96% CPU utilization
      9197 Mbps
  IPv6
      Client
        12.46% CPU utilization
      Server
        14.48% CPU utilization
      8963 Mbps

With UDP checksums, no remote checksum offload
  IPv4
      Client
        15.67% CPU utilization
      Server
        14.83% CPU utilization
      9094 Mbps
  IPv6
      Client
        16.21% CPU utilization
      Server
        14.32% CPU utilization
      9058 Mbps

No UDP checksums
  IPv4
      Client
        15.03% CPU utilization
      Server
        23.09% CPU utilization
      9089 Mbps
  IPv6
      Client
        16.18% CPU utilization
      Server
        26.57% CPU utilization
       8954 Mbps

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index a0d8073..0a7443b 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -19,6 +19,14 @@
 
 /* VXLAN header flags. */
 #define VXLAN_HF_VNI 0x08000000
+#define VXLAN_HF_RCO 0x00200000
+
+/* Remote checksum offload header option */
+#define VXLAN_RCO_MASK  0x7f    /* Last byte of vni field */
+#define VXLAN_RCO_UDP   0x80    /* Indicate UDP RCO (TCP when not set *) */
+#define VXLAN_RCO_SHIFT 1       /* Left shift of start */
+#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1)
+#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT)
 
 #define VXLAN_N_VID     (1u << 24)
 #define VXLAN_VID_MASK  (VXLAN_N_VID - 1)
@@ -38,6 +46,7 @@
 	struct hlist_head vni_list[VNI_HASH_SIZE];
 	atomic_t	  refcnt;
 	struct udp_offload udp_offloads;
+	u32		  flags;
 };
 
 #define VXLAN_F_LEARN			0x01
@@ -49,6 +58,8 @@
 #define VXLAN_F_UDP_CSUM		0x40
 #define VXLAN_F_UDP_ZERO_CSUM6_TX	0x80
 #define VXLAN_F_UDP_ZERO_CSUM6_RX	0x100
+#define VXLAN_F_REMCSUM_TX		0x200
+#define VXLAN_F_REMCSUM_RX		0x400
 
 struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
 				  vxlan_rcv_t *rcv, void *data,