hv_netvsc: Implement support for VF drivers on Hyper-V

Support VF drivers on Hyper-V. On Hyper-V, each VF instance presented to
the guest has an associated synthetic interface that shares the MAC address
with the VF instance. Typically these are bonded together to support
live migration. By default, the host delivers all the incoming packets
on the synthetic interface. Once the VF is up, we need to explicitly switch
the data path on the host to divert traffic onto the VF interface. Even after
switching the data path, broadcast and multicast packets are always delivered
on the synthetic interface and these will have to be injected back onto the
VF interface (if VF is up).
This patch implements the support needed in netvsc for Linux VF
drivers.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 8b3bd8e..6700a4d 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -202,6 +202,8 @@
 int rndis_filter_set_packet_filter(struct rndis_device *dev, u32 new_filter);
 int rndis_filter_set_device_mac(struct hv_device *hdev, char *mac);
 
+void netvsc_switch_datapath(struct netvsc_device *nv_dev, bool vf);
+
 #define NVSP_INVALID_PROTOCOL_VERSION	((u32)0xFFFFFFFF)
 
 #define NVSP_PROTOCOL_VERSION_1		2
@@ -641,6 +643,12 @@
 	u32 event;
 };
 
+struct garp_wrk {
+	struct work_struct dwrk;	/* handler: netvsc_notify_peers() */
+	struct net_device *netdev;	/* passed to netdev_notify_peers() */
+	struct netvsc_device *netvsc_dev; /* its vf_use_cnt is dropped after */
+};
+
 /* The context of the netvsc device  */
 struct net_device_context {
 	/* point back to our device context */
@@ -656,6 +664,7 @@
 
 	struct work_struct work;
 	u32 msg_enable; /* debug level */
+	struct garp_wrk gwrk;
 
 	struct netvsc_stats __percpu *tx_stats;
 	struct netvsc_stats __percpu *rx_stats;
@@ -730,6 +739,11 @@
 	u32 vf_alloc;
 	/* Serial number of the VF to team with */
 	u32 vf_serial;
+	atomic_t open_cnt;
+	/* State to manage the associated VF interface. */
+	bool vf_inject;
+	struct net_device *vf_netdev;
+	atomic_t vf_use_cnt;
 };
 
 /* NdisInitialize message */
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index ec313fc..eddce3c 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -33,6 +33,30 @@
 
 #include "hyperv_net.h"
 
+/*
+ * Ask the host to switch the active data path: to the VF interface when
+ * @vf is true, back to the synthetic interface when it is false.
+ *
+ * The request is built in nv_dev->channel_init_pkt and sent in-band on
+ * the device's channel; the vmbus_sendpacket() result is not checked.
+ */
+void netvsc_switch_datapath(struct netvsc_device *nv_dev, bool vf)
+{
+	struct hv_device *hdev = nv_dev->dev;
+	struct nvsp_message *msg = &nv_dev->channel_init_pkt;
+
+	memset(msg, 0, sizeof(*msg));
+
+	msg->hdr.msg_type = NVSP_MSG4_TYPE_SWITCH_DATA_PATH;
+	msg->msg.v4_msg.active_dp.active_datapath =
+		vf ? NVSP_DATAPATH_VF : NVSP_DATAPATH_SYNTHETIC;
+
+	vmbus_sendpacket(hdev->channel, msg,
+			 sizeof(*msg),
+			 (unsigned long)msg,
+			 VM_PKT_DATA_INBAND, 0);
+}
+
 
 static struct netvsc_device *alloc_net_device(struct hv_device *device)
 {
@@ -52,11 +76,16 @@
 	init_waitqueue_head(&net_device->wait_drain);
 	net_device->start_remove = false;
 	net_device->destroy = false;
+	atomic_set(&net_device->open_cnt, 0);
+	atomic_set(&net_device->vf_use_cnt, 0);
 	net_device->dev = device;
 	net_device->ndev = ndev;
 	net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
 	net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;
 
+	net_device->vf_netdev = NULL;
+	net_device->vf_inject = false;
+
 	hv_set_drvdata(device, net_device);
 	return net_device;
 }
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index b8121eb..bfdb568a 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -610,42 +610,24 @@
 	schedule_delayed_work(&ndev_ctx->dwork, 0);
 }
 
-/*
- * netvsc_recv_callback -  Callback when we receive a packet from the
- * "wire" on the specified device.
- */
-int netvsc_recv_callback(struct hv_device *device_obj,
+
+static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
 				struct hv_netvsc_packet *packet,
-				void **data,
 				struct ndis_tcp_ip_checksum_info *csum_info,
-				struct vmbus_channel *channel,
-				u16 vlan_tci)
+				void *data, u16 vlan_tci)
 {
-	struct net_device *net;
-	struct net_device_context *net_device_ctx;
 	struct sk_buff *skb;
-	struct netvsc_stats *rx_stats;
 
-	net = ((struct netvsc_device *)hv_get_drvdata(device_obj))->ndev;
-	if (!net || net->reg_state != NETREG_REGISTERED) {
-		return NVSP_STAT_FAIL;
-	}
-	net_device_ctx = netdev_priv(net);
-	rx_stats = this_cpu_ptr(net_device_ctx->rx_stats);
-
-	/* Allocate a skb - TODO direct I/O to pages? */
 	skb = netdev_alloc_skb_ip_align(net, packet->total_data_buflen);
-	if (unlikely(!skb)) {
-		++net->stats.rx_dropped;
-		return NVSP_STAT_FAIL;
-	}
+	if (!skb)
+		return skb;
 
 	/*
 	 * Copy to skb. This copy is needed here since the memory pointed by
 	 * hv_netvsc_packet cannot be deallocated
 	 */
-	memcpy(skb_put(skb, packet->total_data_buflen), *data,
-		packet->total_data_buflen);
+	memcpy(skb_put(skb, packet->total_data_buflen), data,
+	       packet->total_data_buflen);
 
 	skb->protocol = eth_type_trans(skb, net);
 	if (csum_info) {
@@ -663,6 +645,75 @@
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
 				       vlan_tci);
 
+	return skb;
+}
+
+/*
+ * netvsc_recv_callback -  Callback when we receive a packet from the
+ * "wire" on the specified device.
+ */
+int netvsc_recv_callback(struct hv_device *device_obj,
+				struct hv_netvsc_packet *packet,
+				void **data,
+				struct ndis_tcp_ip_checksum_info *csum_info,
+				struct vmbus_channel *channel,
+				u16 vlan_tci)
+{
+	struct net_device *net;
+	struct net_device_context *net_device_ctx;
+	struct sk_buff *skb;
+	struct sk_buff *vf_skb;
+	struct netvsc_stats *rx_stats;
+	struct netvsc_device *netvsc_dev = hv_get_drvdata(device_obj);
+	u32 bytes_recvd = packet->total_data_buflen;
+	int ret = 0;
+
+	net = netvsc_dev->ndev;
+	if (!net || net->reg_state != NETREG_REGISTERED)
+		return NVSP_STAT_FAIL;
+
+	if (READ_ONCE(netvsc_dev->vf_inject)) {
+		atomic_inc(&netvsc_dev->vf_use_cnt);
+		if (!READ_ONCE(netvsc_dev->vf_inject)) {
+			/*
+			 * We raced; just move on.
+			 */
+			atomic_dec(&netvsc_dev->vf_use_cnt);
+			goto vf_injection_done;
+		}
+
+		/*
+		 * Inject this packet into the VF interface.
+		 * On Hyper-V, multicast and broadcast packets
+		 * are only delivered on the synthetic interface
+		 * (after subjecting these to policy filters on
+		 * the host). Deliver these via the VF interface
+		 * in the guest.
+		 */
+		vf_skb = netvsc_alloc_recv_skb(netvsc_dev->vf_netdev, packet,
+					       csum_info, *data, vlan_tci);
+		if (vf_skb != NULL) {
+			++netvsc_dev->vf_netdev->stats.rx_packets;
+			netvsc_dev->vf_netdev->stats.rx_bytes += bytes_recvd;
+			netif_receive_skb(vf_skb);
+		} else {
+			++net->stats.rx_dropped;
+			ret = NVSP_STAT_FAIL;
+		}
+		atomic_dec(&netvsc_dev->vf_use_cnt);
+		return ret;
+	}
+
+vf_injection_done:
+	net_device_ctx = netdev_priv(net);
+	rx_stats = this_cpu_ptr(net_device_ctx->rx_stats);
+
+	/* Allocate a skb - TODO direct I/O to pages? */
+	skb = netvsc_alloc_recv_skb(net, packet, csum_info, *data, vlan_tci);
+	if (unlikely(!skb)) {
+		++net->stats.rx_dropped;
+		return NVSP_STAT_FAIL;
+	}
 	skb_record_rx_queue(skb, channel->
 			    offermsg.offer.sub_channel_index);
 
@@ -1102,6 +1153,175 @@
 	free_netdev(netdev);
 }
 
+/* Work handler: notify peers, then drop the vf_use_cnt reference. */
+static void netvsc_notify_peers(struct work_struct *wrk)
+{
+	struct garp_wrk *garp = container_of(wrk, struct garp_wrk, dwrk);
+
+	netdev_notify_peers(garp->netdev);
+
+	/* Pairs with the atomic_inc() done before the work was scheduled. */
+	atomic_dec(&garp->netvsc_dev->vf_use_cnt);
+}
+
+/*
+ * Find the netvsc device whose net_device in init_net has the given MAC
+ * address and uses netvsc's device_ops. Returns NULL when none matches.
+ * NOTE(review): the list walk proceeds even when rtnl_trylock() fails;
+ * presumably every caller already holds RTNL - confirm.
+ */
+static struct netvsc_device *get_netvsc_device(char *mac)
+{
+	struct net_device_context *ndev_ctx = NULL;
+	struct net_device *ndev;
+	int took_rtnl = rtnl_trylock();
+
+	for_each_netdev(&init_net, ndev) {
+		if (ndev->netdev_ops == &device_ops &&
+		    memcmp(ndev->dev_addr, mac, ETH_ALEN) == 0) {
+			ndev_ctx = netdev_priv(ndev);
+			break;
+		}
+	}
+	if (took_rtnl)
+		rtnl_unlock();
+
+	return ndev_ctx ? hv_get_drvdata(ndev_ctx->device_ctx) : NULL;
+}
+
+/*
+ * NETDEV_REGISTER handler: remember @vf_netdev as the VF of the synthetic
+ * interface that shares its MAC address. Returns NOTIFY_OK when a VF was
+ * associated (a module reference is then held), NOTIFY_DONE otherwise.
+ */
+static int netvsc_register_vf(struct net_device *vf_netdev)
+{
+	const struct ethtool_ops *eth_ops = vf_netdev->ethtool_ops;
+	struct netvsc_device *netvsc_dev;
+
+	if (eth_ops == NULL || eth_ops == &ethtool_ops)
+		return NOTIFY_DONE;
+
+	/* No synthetic interface with this MAC address: not our VF. */
+	netvsc_dev = get_netvsc_device(vf_netdev->dev_addr);
+	if (netvsc_dev == NULL)
+		return NOTIFY_DONE;
+
+	/* Pin the module while a VF is attached; skip the VF if we cannot. */
+	if (!try_module_get(THIS_MODULE))
+		return NOTIFY_DONE;
+	netdev_info(netvsc_dev->ndev, "VF registering: %s\n", vf_netdev->name);
+	netvsc_dev->vf_netdev = vf_netdev;
+	return NOTIFY_OK;
+}
+
+
+/*
+ * NETDEV_UP handler for a VF: enable packet injection into the VF, open
+ * the synthetic device, switch the host data path to the VF and schedule
+ * work to notify peers. Returns NOTIFY_DONE if @vf_netdev is not our VF.
+ */
+static int netvsc_vf_up(struct net_device *vf_netdev)
+{
+	const struct ethtool_ops *eth_ops = vf_netdev->ethtool_ops;
+	struct net_device_context *net_device_ctx;
+	struct netvsc_device *netvsc_dev;
+
+	/* Ignore devices that use netvsc's own ethtool_ops. */
+	if (eth_ops == &ethtool_ops)
+		return NOTIFY_DONE;
+
+	netvsc_dev = get_netvsc_device(vf_netdev->dev_addr);
+	if ((netvsc_dev == NULL) || (netvsc_dev->vf_netdev == NULL))
+		return NOTIFY_DONE;
+
+	netdev_info(netvsc_dev->ndev, "VF up: %s\n", vf_netdev->name);
+	net_device_ctx = netdev_priv(netvsc_dev->ndev);
+	netvsc_dev->vf_inject = true;
+
+	/* Open the device before asking the host to switch the data path. */
+	rndis_filter_open(net_device_ctx->device_ctx);
+	netvsc_switch_datapath(netvsc_dev, true);
+	netdev_info(netvsc_dev->ndev, "Data path switched to VF: %s\n",
+		    vf_netdev->name);
+
+	netif_carrier_off(netvsc_dev->ndev);
+
+	/*
+	 * Notify peers from a work item, holding a vf_use_cnt reference
+	 * until it has run. If the work was already pending it will run,
+	 * and drop a reference, only once - so give ours back right away.
+	 */
+	atomic_inc(&netvsc_dev->vf_use_cnt);
+	net_device_ctx->gwrk.netdev = vf_netdev;
+	net_device_ctx->gwrk.netvsc_dev = netvsc_dev;
+	if (!schedule_work(&net_device_ctx->gwrk.dwrk))
+		atomic_dec(&netvsc_dev->vf_use_cnt);
+
+	return NOTIFY_OK;
+}
+
+
+/*
+ * NETDEV_DOWN handler for a VF: stop injecting packets into the VF, wait
+ * for in-flight users, switch the host data path back to the synthetic
+ * interface and schedule work to notify peers on it.
+ */
+static int netvsc_vf_down(struct net_device *vf_netdev)
+{
+	const struct ethtool_ops *eth_ops = vf_netdev->ethtool_ops;
+	struct net_device_context *ndev_ctx;
+	struct netvsc_device *nvdev;
+
+	if (eth_ops == &ethtool_ops)
+		return NOTIFY_DONE;
+
+	nvdev = get_netvsc_device(vf_netdev->dev_addr);
+	if (!nvdev || !nvdev->vf_netdev)
+		return NOTIFY_DONE;
+
+	netdev_info(nvdev->ndev, "VF down: %s\n", vf_netdev->name);
+	ndev_ctx = netdev_priv(nvdev->ndev);
+	nvdev->vf_inject = false;
+
+	/* Busy-wait until every holder of vf_use_cnt has let go. */
+	while (atomic_read(&nvdev->vf_use_cnt) != 0)
+		udelay(50);
+	netvsc_switch_datapath(nvdev, false);
+	netdev_info(nvdev->ndev, "Data path switched from VF: %s\n",
+		    vf_netdev->name);
+	rndis_filter_close(ndev_ctx->device_ctx);
+	netif_carrier_on(nvdev->ndev);
+
+	/* Notify peers; the work handler drops this reference. */
+	atomic_inc(&nvdev->vf_use_cnt);
+	ndev_ctx->gwrk.netdev = nvdev->ndev;
+	ndev_ctx->gwrk.netvsc_dev = nvdev;
+	schedule_work(&ndev_ctx->gwrk.dwrk);
+
+	return NOTIFY_OK;
+}
+
+
+/* Detach the VF; a module ref is only dropped if a VF was attached. */
+static int netvsc_unregister_vf(struct net_device *vf_netdev)
+{
+	const struct ethtool_ops *eth_ops = vf_netdev->ethtool_ops;
+	struct netvsc_device *netvsc_dev;
+
+	if (eth_ops == &ethtool_ops)
+		return NOTIFY_DONE;
+
+	netvsc_dev = get_netvsc_device(vf_netdev->dev_addr);
+	if (netvsc_dev == NULL || netvsc_dev->vf_netdev == NULL)
+		return NOTIFY_DONE;
+	netdev_info(netvsc_dev->ndev, "VF unregistering: %s\n",
+		    vf_netdev->name);
+	netvsc_dev->vf_netdev = NULL;
+	module_put(THIS_MODULE);
+	return NOTIFY_OK;
+}
+
 static int netvsc_probe(struct hv_device *dev,
 			const struct hv_vmbus_device_id *dev_id)
 {
@@ -1140,6 +1360,7 @@
 	hv_set_drvdata(dev, net);
 	INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
 	INIT_WORK(&net_device_ctx->work, do_set_multicast);
+	INIT_WORK(&net_device_ctx->gwrk.dwrk, netvsc_notify_peers);
 
 	spin_lock_init(&net_device_ctx->lock);
 	INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
@@ -1235,19 +1456,58 @@
 	.remove = netvsc_remove,
 };
 
+
+/*
+ * Netdevice notifier callback.
+ *
+ * On Hyper-V, every VF interface is matched with a corresponding
+ * synthetic interface, which is presented to the guest first. Register,
+ * unregister, up and down events are forwarded to the VF handlers, which
+ * take care of switching the data path; other events return NOTIFY_DONE.
+ */
+static int netvsc_netdev_event(struct notifier_block *this,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	if (event == NETDEV_REGISTER)
+		return netvsc_register_vf(event_dev);
+	if (event == NETDEV_UNREGISTER)
+		return netvsc_unregister_vf(event_dev);
+	if (event == NETDEV_UP)
+		return netvsc_vf_up(event_dev);
+	if (event == NETDEV_DOWN)
+		return netvsc_vf_down(event_dev);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block netvsc_netdev_notifier = {
+	.notifier_call = netvsc_netdev_event,	/* VF netdev event dispatch */
+};
+
 static void __exit netvsc_drv_exit(void)
 {
+	unregister_netdevice_notifier(&netvsc_netdev_notifier);
 	vmbus_driver_unregister(&netvsc_drv);
 }
 
 static int __init netvsc_drv_init(void)
 {
+	int ret;
+
 	if (ring_size < RING_SIZE_MIN) {
 		ring_size = RING_SIZE_MIN;
 		pr_info("Increased ring_size to %d (min allowed)\n",
 			ring_size);
 	}
-	return vmbus_driver_register(&netvsc_drv);
+	ret = vmbus_driver_register(&netvsc_drv);
+
+	if (ret)
+		return ret;
+
+	register_netdevice_notifier(&netvsc_netdev_notifier);
+	return 0;
 }
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index c4e1e04..a59cdeb 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1229,6 +1229,9 @@
 	if (!net_device)
 		return -EINVAL;
 
+	if (atomic_inc_return(&net_device->open_cnt) != 1)
+		return 0;
+
 	return rndis_filter_open_device(net_device->extension);
 }
 
@@ -1239,5 +1242,8 @@
 	if (!nvdev)
 		return -EINVAL;
 
+	if (atomic_dec_return(&nvdev->open_cnt) != 0)
+		return 0;
+
 	return rndis_filter_close_device(nvdev->extension);
 }