net/mlx4_core: Enable device recovery flow with SRIOV
In SRIOV, both the PF and the VF may attempt device recovery whenever they
assume that the device is not functioning. When the PF driver resets the
device, the VF should detect this and attempt to reinitialize itself.
The VF must be able to reset itself under all circumstances, even
if the PF is not responsive.
The VF shall reset itself in the following cases:
1. Commands are not processed within reasonable time over the communication channel.
This is done considering device state and the correct return code based on
the command as was done in the native mode, done in the next patch.
2. The VF driver receives an internal error event reported by the PF on the
communication channel. This occurs when the PF driver resets the device or
when VF is out of sync with the PF.
Add 'VF reset' capability, which allows the VF to reinitialize itself even when the
PF is not responsive.
As PF and VF may run their reset flow simulantanisly, there are several cases
that are handled:
- Prevent freeing VF resources upon FLR, when PF is in its unloading stage.
- Prevent PF getting VF commands before it has finished initializing its resources.
- Upon VF startup, check that comm-channel is online before sending
commands to the PF and getting timed-out.
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 3895b2b..7652eed 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -42,6 +42,7 @@
#include <linux/mlx4/device.h>
#include <linux/semaphore.h>
#include <rdma/ib_smi.h>
+#include <linux/delay.h>
#include <asm/io.h>
@@ -729,7 +730,7 @@
EXPORT_SYMBOL_GPL(__mlx4_cmd);
-static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev)
+int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev)
{
return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL,
MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE);
@@ -1945,8 +1946,11 @@
break;
case MLX4_COMM_CMD_VHCR_POST:
if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) &&
- (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST))
+ (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) {
+ mlx4_warn(dev, "slave:%d is out of sync, cmd=0x%x, last command=0x%x, reset is needed\n",
+ slave, cmd, slave_state[slave].last_cmd);
goto reset_slave;
+ }
mutex_lock(&priv->cmd.slave_cmd_mutex);
if (mlx4_master_process_vhcr(dev, slave, NULL)) {
@@ -1980,7 +1984,18 @@
reset_slave:
/* cleanup any slave resources */
- mlx4_delete_all_resources_for_slave(dev, slave);
+ if (dev->persist->interface_state & MLX4_INTERFACE_STATE_UP)
+ mlx4_delete_all_resources_for_slave(dev, slave);
+
+ if (cmd != MLX4_COMM_CMD_RESET) {
+ mlx4_warn(dev, "Turn on internal error to force reset, slave=%d, cmd=0x%x\n",
+ slave, cmd);
+ /* Turn on internal error letting slave reset itself immeditaly,
+ * otherwise it might take till timeout on command is passed
+ */
+ reply |= ((u32)COMM_CHAN_EVENT_INTERNAL_ERR);
+ }
+
spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags);
if (!slave_state[slave].is_slave_going_down)
slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET;
@@ -2056,17 +2071,28 @@
static int sync_toggles(struct mlx4_dev *dev)
{
struct mlx4_priv *priv = mlx4_priv(dev);
- int wr_toggle;
- int rd_toggle;
+ u32 wr_toggle;
+ u32 rd_toggle;
unsigned long end;
- wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31;
- end = jiffies + msecs_to_jiffies(5000);
+ wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write));
+ if (wr_toggle == 0xffffffff)
+ end = jiffies + msecs_to_jiffies(30000);
+ else
+ end = jiffies + msecs_to_jiffies(5000);
while (time_before(jiffies, end)) {
- rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31;
- if (rd_toggle == wr_toggle) {
- priv->cmd.comm_toggle = rd_toggle;
+ rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read));
+ if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) {
+ /* PCI might be offline */
+ msleep(100);
+ wr_toggle = swab32(readl(&priv->mfunc.comm->
+ slave_write));
+ continue;
+ }
+
+ if (rd_toggle >> 31 == wr_toggle >> 31) {
+ priv->cmd.comm_toggle = rd_toggle >> 31;
return 0;
}
@@ -2172,13 +2198,6 @@
if (mlx4_init_resource_tracker(dev))
goto err_thread;
- err = mlx4_ARM_COMM_CHANNEL(dev);
- if (err) {
- mlx4_err(dev, " Failed to arm comm channel eq: %x\n",
- err);
- goto err_resource;
- }
-
} else {
err = sync_toggles(dev);
if (err) {
@@ -2188,8 +2207,6 @@
}
return 0;
-err_resource:
- mlx4_free_resource_tracker(dev, RES_TR_FREE_ALL);
err_thread:
flush_workqueue(priv->mfunc.master.comm_wq);
destroy_workqueue(priv->mfunc.master.comm_wq);
@@ -2266,6 +2283,27 @@
return -ENOMEM;
}
+void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ int slave;
+ u32 slave_read;
+
+ /* Report an internal error event to all
+ * communication channels.
+ */
+ for (slave = 0; slave < dev->num_slaves; slave++) {
+ slave_read = swab32(readl(&priv->mfunc.comm[slave].slave_read));
+ slave_read |= (u32)COMM_CHAN_EVENT_INTERNAL_ERR;
+ __raw_writel((__force u32)cpu_to_be32(slave_read),
+ &priv->mfunc.comm[slave].slave_read);
+ /* Make sure that our comm channel write doesn't
+ * get mixed in with writes from another CPU.
+ */
+ mmiowb();
+ }
+}
+
void mlx4_multi_func_cleanup(struct mlx4_dev *dev)
{
struct mlx4_priv *priv = mlx4_priv(dev);
@@ -2281,6 +2319,7 @@
kfree(priv->mfunc.master.slave_state);
kfree(priv->mfunc.master.vf_admin);
kfree(priv->mfunc.master.vf_oper);
+ dev->num_slaves = 0;
}
iounmap(priv->mfunc.comm);