net/mlx4_core: Enable device recovery flow with SRIOV
In SRIOV, both the PF and the VF may attempt device recovery whenever they
assume that the device is not functioning. When the PF driver resets the
device, the VF should detect this and attempt to reinitialize itself.
The VF must be able to reset itself under all circumstances, even
if the PF is not responsive.
The VF shall reset itself in the following cases:
1. Commands are not processed within a reasonable time over the communication channel.
Handling this takes the device state into account and returns the correct return code
for each command, as is done in native mode; this is implemented in the next patch.
2. The VF driver receives an internal error event reported by the PF on the
communication channel. This occurs when the PF driver resets the device or
when the VF is out of sync with the PF.
Add a 'VF reset' capability, which allows the VF to reinitialize itself even when the
PF is not responsive.
As the PF and the VF may run their reset flows simultaneously, several cases
are handled:
- Prevent freeing VF resources upon FLR, when PF is in its unloading stage.
- Prevent the PF from getting VF commands before it has finished initializing its resources.
- Upon VF startup, check that the comm-channel is online before sending
commands to the PF, so that they do not simply time out.
Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 6bb0fca..1baf1f1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -108,6 +108,8 @@
MLX4_FUNC_CAP_EQE_CQE_STRIDE | \
MLX4_FUNC_CAP_DMFS_A0_STATIC)
+#define RESET_PERSIST_MASK_FLAGS (MLX4_FLAG_SRIOV)
+
static char mlx4_version[] =
DRV_NAME ": Mellanox ConnectX core driver v"
DRV_VERSION " (" DRV_RELDATE ")\n";
@@ -1579,6 +1581,50 @@
}
}
+static int mlx4_comm_check_offline(struct mlx4_dev *dev)
+{
+#define COMM_CHAN_OFFLINE_OFFSET 0x09
+
+ u32 comm_flags;
+ u32 offline_bit;
+ unsigned long end;
+ struct mlx4_priv *priv = mlx4_priv(dev);
+
+ end = msecs_to_jiffies(MLX4_COMM_OFFLINE_TIME_OUT) + jiffies;
+ while (time_before(jiffies, end)) {
+ comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
+ MLX4_COMM_CHAN_FLAGS));
+ offline_bit = (comm_flags &
+ (u32)(1 << COMM_CHAN_OFFLINE_OFFSET));
+ if (!offline_bit)
+ return 0;
+ /* There are cases as part of AER/Reset flow that PF needs
+ * around 100 msec to load. We therefore sleep for 100 msec
+ * to allow other tasks to make use of that CPU during this
+ * time interval.
+ */
+ msleep(100);
+ }
+ mlx4_err(dev, "Communication channel is offline.\n");
+ return -EIO;
+}
+
+static void mlx4_reset_vf_support(struct mlx4_dev *dev)
+{
+#define COMM_CHAN_RST_OFFSET 0x1e
+
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ u32 comm_rst;
+ u32 comm_caps;
+
+ comm_caps = swab32(readl((__iomem char *)priv->mfunc.comm +
+ MLX4_COMM_CHAN_CAPS));
+ comm_rst = (comm_caps & (u32)(1 << COMM_CHAN_RST_OFFSET));
+
+ if (comm_rst)
+ dev->caps.vf_caps |= MLX4_VF_CAP_FLAG_RESET;
+}
+
static int mlx4_init_slave(struct mlx4_dev *dev)
{
struct mlx4_priv *priv = mlx4_priv(dev);
@@ -1594,6 +1640,12 @@
mutex_lock(&priv->cmd.slave_cmd_mutex);
priv->cmd.max_cmds = 1;
+ if (mlx4_comm_check_offline(dev)) {
+ mlx4_err(dev, "PF is not responsive, skipping initialization\n");
+ goto err_offline;
+ }
+
+ mlx4_reset_vf_support(dev);
mlx4_warn(dev, "Sending reset\n");
ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0,
MLX4_COMM_TIME);
@@ -1637,6 +1689,7 @@
err:
mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0);
+err_offline:
mutex_unlock(&priv->cmd.slave_cmd_mutex);
return -EIO;
}
@@ -2494,11 +2547,19 @@
!!((flags) & MLX4_FLAG_MASTER))
static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev,
- u8 total_vfs, int existing_vfs)
+ u8 total_vfs, int existing_vfs, int reset_flow)
{
u64 dev_flags = dev->flags;
int err = 0;
+ if (reset_flow) {
+ dev->dev_vfs = kcalloc(total_vfs, sizeof(*dev->dev_vfs),
+ GFP_KERNEL);
+ if (!dev->dev_vfs)
+ goto free_mem;
+ return dev_flags;
+ }
+
atomic_inc(&pf_loading);
if (dev->flags & MLX4_FLAG_SRIOV) {
if (existing_vfs != total_vfs) {
@@ -2533,6 +2594,7 @@
disable_sriov:
atomic_dec(&pf_loading);
+free_mem:
dev->persist->num_vfs = 0;
kfree(dev->dev_vfs);
return dev_flags & ~MLX4_FLAG_MASTER;
@@ -2557,7 +2619,8 @@
}
static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
- int total_vfs, int *nvfs, struct mlx4_priv *priv)
+ int total_vfs, int *nvfs, struct mlx4_priv *priv,
+ int reset_flow)
{
struct mlx4_dev *dev;
unsigned sum = 0;
@@ -2679,8 +2742,10 @@
goto err_fw;
if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
- u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs,
- existing_vfs);
+ u64 dev_flags = mlx4_enable_sriov(dev, pdev,
+ total_vfs,
+ existing_vfs,
+ reset_flow);
mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
dev->flags = dev_flags;
@@ -2722,7 +2787,7 @@
if (dev->flags & MLX4_FLAG_SRIOV) {
if (!existing_vfs)
pci_disable_sriov(pdev);
- if (mlx4_is_master(dev))
+ if (mlx4_is_master(dev) && !reset_flow)
atomic_dec(&pf_loading);
dev->flags &= ~MLX4_FLAG_SRIOV;
}
@@ -2736,7 +2801,8 @@
}
if (mlx4_is_master(dev) && (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) {
- u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, existing_vfs);
+ u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs,
+ existing_vfs, reset_flow);
if ((dev->flags ^ dev_flags) & (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE)) {
mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_VHCR);
@@ -2848,6 +2914,17 @@
goto err_steer;
mlx4_init_quotas(dev);
+ /* When PF resources are ready arm its comm channel to enable
+ * getting commands
+ */
+ if (mlx4_is_master(dev)) {
+ err = mlx4_ARM_COMM_CHANNEL(dev);
+ if (err) {
+ mlx4_err(dev, " Failed to arm comm channel eq: %x\n",
+ err);
+ goto err_steer;
+ }
+ }
for (port = 1; port <= dev->caps.num_ports; port++) {
err = mlx4_init_port_info(dev, port);
@@ -2866,7 +2943,7 @@
priv->removed = 0;
- if (mlx4_is_master(dev) && dev->persist->num_vfs)
+ if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow)
atomic_dec(&pf_loading);
kfree(dev_cap);
@@ -2925,10 +3002,12 @@
mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL);
err_sriov:
- if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs)
+ if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) {
pci_disable_sriov(pdev);
+ dev->flags &= ~MLX4_FLAG_SRIOV;
+ }
- if (mlx4_is_master(dev) && dev->persist->num_vfs)
+ if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow)
atomic_dec(&pf_loading);
kfree(priv->dev.dev_vfs);
@@ -3073,7 +3152,7 @@
if (err)
goto err_release_regions;
- err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv);
+ err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0);
if (err)
goto err_catas;
@@ -3131,9 +3210,11 @@
{
struct mlx4_dev_persistent *persist = dev->persist;
struct mlx4_priv *priv = mlx4_priv(dev);
+ unsigned long flags = (dev->flags & RESET_PERSIST_MASK_FLAGS);
memset(priv, 0, sizeof(*priv));
priv->dev.persist = persist;
+ priv->dev.flags = flags;
}
static void mlx4_unload_one(struct pci_dev *pdev)
@@ -3143,7 +3224,6 @@
struct mlx4_priv *priv = mlx4_priv(dev);
int pci_dev_data;
int p, i;
- int active_vfs = 0;
if (priv->removed)
return;
@@ -3157,14 +3237,6 @@
pci_dev_data = priv->pci_dev_data;
- /* Disabling SR-IOV is not allowed while there are active vf's */
- if (mlx4_is_master(dev)) {
- active_vfs = mlx4_how_many_lives_vf(dev);
- if (active_vfs) {
- pr_warn("Removing PF when there are active VF's !!\n");
- pr_warn("Will not disable SR-IOV.\n");
- }
- }
mlx4_stop_sense(dev);
mlx4_unregister_device(dev);
@@ -3208,12 +3280,6 @@
if (dev->flags & MLX4_FLAG_MSI_X)
pci_disable_msix(pdev);
- if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) {
- mlx4_warn(dev, "Disabling SR-IOV\n");
- pci_disable_sriov(pdev);
- dev->flags &= ~MLX4_FLAG_SRIOV;
- dev->persist->num_vfs = 0;
- }
if (!mlx4_is_slave(dev))
mlx4_free_ownership(dev);
@@ -3235,11 +3301,21 @@
struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev);
struct mlx4_dev *dev = persist->dev;
struct mlx4_priv *priv = mlx4_priv(dev);
+ int active_vfs = 0;
mutex_lock(&persist->interface_state_mutex);
persist->interface_state |= MLX4_INTERFACE_STATE_DELETION;
mutex_unlock(&persist->interface_state_mutex);
+ /* Disabling SR-IOV is not allowed while there are active vf's */
+ if (mlx4_is_master(dev) && dev->flags & MLX4_FLAG_SRIOV) {
+ active_vfs = mlx4_how_many_lives_vf(dev);
+ if (active_vfs) {
+ pr_warn("Removing PF when there are active VF's !!\n");
+ pr_warn("Will not disable SR-IOV.\n");
+ }
+ }
+
/* device marked to be under deletion running now without the lock
* letting other tasks to be terminated
*/
@@ -3248,6 +3324,11 @@
else
mlx4_info(dev, "%s: interface is down\n", __func__);
mlx4_catas_end(dev);
+ if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) {
+ mlx4_warn(dev, "Disabling SR-IOV\n");
+ pci_disable_sriov(pdev);
+ }
+
pci_release_regions(pdev);
pci_disable_device(pdev);
kfree(dev->persist);
@@ -3287,7 +3368,7 @@
memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs));
mlx4_unload_one(pdev);
- err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv);
+ err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 1);
if (err) {
mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n",
__func__, pci_name(pdev), err);
@@ -3397,7 +3478,7 @@
mutex_lock(&persist->interface_state_mutex);
if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) {
ret = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs, nvfs,
- priv);
+ priv, 1);
if (ret) {
mlx4_err(dev, "%s: mlx4_load_one failed, ret=%d\n",
__func__, ret);