Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. |
Jack Morgenstein | 51a379d | 2008-07-25 10:32:52 -0700 | [diff] [blame] | 3 | * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 4 | * |
| 5 | * This software is available to you under a choice of one of two |
| 6 | * licenses. You may choose to be licensed under the terms of the GNU |
| 7 | * General Public License (GPL) Version 2, available from the file |
| 8 | * COPYING in the main directory of this source tree, or the |
| 9 | * OpenIB.org BSD license below: |
| 10 | * |
| 11 | * Redistribution and use in source and binary forms, with or |
| 12 | * without modification, are permitted provided that the following |
| 13 | * conditions are met: |
| 14 | * |
| 15 | * - Redistributions of source code must retain the above |
| 16 | * copyright notice, this list of conditions and the following |
| 17 | * disclaimer. |
| 18 | * |
| 19 | * - Redistributions in binary form must reproduce the above |
| 20 | * copyright notice, this list of conditions and the following |
| 21 | * disclaimer in the documentation and/or other materials |
| 22 | * provided with the distribution. |
| 23 | * |
| 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| 25 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| 26 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
| 27 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
| 28 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
| 29 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
| 30 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 31 | * SOFTWARE. |
| 32 | */ |
| 33 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 34 | #include <linux/workqueue.h> |
Paul Gortmaker | 9d9779e | 2011-07-03 15:21:01 -0400 | [diff] [blame] | 35 | #include <linux/module.h> |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 36 | |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 37 | #include "mlx4.h" |
| 38 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 39 | enum { |
| 40 | MLX4_CATAS_POLL_INTERVAL = 5 * HZ, |
| 41 | }; |
| 42 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 43 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 44 | |
Yishai Hadas | f5aef5a | 2015-01-25 16:59:39 +0200 | [diff] [blame] | 45 | int mlx4_internal_err_reset = 1; |
| 46 | module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 47 | MODULE_PARM_DESC(internal_err_reset, |
Yishai Hadas | 55ad359 | 2015-01-25 16:59:42 +0200 | [diff] [blame] | 48 | "Reset device on internal errors if non-zero (default 1)"); |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 49 | |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 50 | static int read_vendor_id(struct mlx4_dev *dev) |
| 51 | { |
| 52 | u16 vendor_id = 0; |
| 53 | int ret; |
| 54 | |
| 55 | ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); |
| 56 | if (ret) { |
| 57 | mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); |
| 58 | return ret; |
| 59 | } |
| 60 | |
| 61 | if (vendor_id == 0xffff) { |
| 62 | mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); |
| 63 | return -EINVAL; |
| 64 | } |
| 65 | |
| 66 | return 0; |
| 67 | } |
| 68 | |
| 69 | static int mlx4_reset_master(struct mlx4_dev *dev) |
| 70 | { |
| 71 | int err = 0; |
| 72 | |
Yishai Hadas | 55ad359 | 2015-01-25 16:59:42 +0200 | [diff] [blame] | 73 | if (mlx4_is_master(dev)) |
| 74 | mlx4_report_internal_err_comm_event(dev); |
| 75 | |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 76 | if (!pci_channel_offline(dev->persist->pdev)) { |
| 77 | err = read_vendor_id(dev); |
| 78 | /* If PCI can't be accessed to read vendor ID we assume that its |
| 79 | * link was disabled and chip was already reset. |
| 80 | */ |
| 81 | if (err) |
| 82 | return 0; |
| 83 | |
| 84 | err = mlx4_reset(dev); |
| 85 | if (err) |
| 86 | mlx4_err(dev, "Fail to reset HCA\n"); |
| 87 | } |
| 88 | |
| 89 | return err; |
| 90 | } |
| 91 | |
Yishai Hadas | 55ad359 | 2015-01-25 16:59:42 +0200 | [diff] [blame] | 92 | static int mlx4_reset_slave(struct mlx4_dev *dev) |
| 93 | { |
| 94 | #define COM_CHAN_RST_REQ_OFFSET 0x10 |
| 95 | #define COM_CHAN_RST_ACK_OFFSET 0x08 |
| 96 | |
| 97 | u32 comm_flags; |
| 98 | u32 rst_req; |
| 99 | u32 rst_ack; |
| 100 | unsigned long end; |
| 101 | struct mlx4_priv *priv = mlx4_priv(dev); |
| 102 | |
| 103 | if (pci_channel_offline(dev->persist->pdev)) |
| 104 | return 0; |
| 105 | |
| 106 | comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + |
| 107 | MLX4_COMM_CHAN_FLAGS)); |
| 108 | if (comm_flags == 0xffffffff) { |
| 109 | mlx4_err(dev, "VF reset is not needed\n"); |
| 110 | return 0; |
| 111 | } |
| 112 | |
| 113 | if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) { |
| 114 | mlx4_err(dev, "VF reset is not supported\n"); |
| 115 | return -EOPNOTSUPP; |
| 116 | } |
| 117 | |
| 118 | rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> |
| 119 | COM_CHAN_RST_REQ_OFFSET; |
| 120 | rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> |
| 121 | COM_CHAN_RST_ACK_OFFSET; |
| 122 | if (rst_req != rst_ack) { |
| 123 | mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n"); |
| 124 | return -EIO; |
| 125 | } |
| 126 | |
| 127 | rst_req ^= 1; |
| 128 | mlx4_warn(dev, "VF is sending reset request to Firmware\n"); |
| 129 | comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET; |
| 130 | __raw_writel((__force u32)cpu_to_be32(comm_flags), |
| 131 | (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS); |
| 132 | /* Make sure that our comm channel write doesn't |
| 133 | * get mixed in with writes from another CPU. |
| 134 | */ |
| 135 | mmiowb(); |
| 136 | |
| 137 | end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies; |
| 138 | while (time_before(jiffies, end)) { |
| 139 | comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + |
| 140 | MLX4_COMM_CHAN_FLAGS)); |
| 141 | rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> |
| 142 | COM_CHAN_RST_ACK_OFFSET; |
| 143 | |
| 144 | /* Reading rst_req again since the communication channel can |
| 145 | * be reset at any time by the PF and all its bits will be |
| 146 | * set to zero. |
| 147 | */ |
| 148 | rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> |
| 149 | COM_CHAN_RST_REQ_OFFSET; |
| 150 | |
| 151 | if (rst_ack == rst_req) { |
| 152 | mlx4_warn(dev, "VF Reset succeed\n"); |
| 153 | return 0; |
| 154 | } |
| 155 | cond_resched(); |
| 156 | } |
| 157 | mlx4_err(dev, "Fail to send reset over the communication channel\n"); |
| 158 | return -ETIMEDOUT; |
| 159 | } |
| 160 | |
| 161 | static int mlx4_comm_internal_err(u32 slave_read) |
| 162 | { |
| 163 | return (u32)COMM_CHAN_EVENT_INTERNAL_ERR == |
| 164 | (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0; |
| 165 | } |
| 166 | |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 167 | void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) |
| 168 | { |
| 169 | int err; |
| 170 | struct mlx4_dev *dev; |
| 171 | |
Yishai Hadas | f5aef5a | 2015-01-25 16:59:39 +0200 | [diff] [blame] | 172 | if (!mlx4_internal_err_reset) |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 173 | return; |
| 174 | |
| 175 | mutex_lock(&persist->device_state_mutex); |
| 176 | if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) |
| 177 | goto out; |
| 178 | |
| 179 | dev = persist->dev; |
| 180 | mlx4_err(dev, "device is going to be reset\n"); |
Yishai Hadas | 55ad359 | 2015-01-25 16:59:42 +0200 | [diff] [blame] | 181 | if (mlx4_is_slave(dev)) |
| 182 | err = mlx4_reset_slave(dev); |
| 183 | else |
| 184 | err = mlx4_reset_master(dev); |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 185 | |
Daniel Jurgens | 22e3817 | 2016-02-17 17:24:25 +0200 | [diff] [blame] | 186 | if (!err) { |
| 187 | mlx4_err(dev, "device was reset successfully\n"); |
| 188 | } else { |
| 189 | /* EEH could have disabled the PCI channel during reset. That's |
| 190 | * recoverable and the PCI error flow will handle it. |
| 191 | */ |
| 192 | if (!pci_channel_offline(dev->persist->pdev)) |
| 193 | BUG_ON(1); |
| 194 | } |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 195 | dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 196 | mutex_unlock(&persist->device_state_mutex); |
| 197 | |
| 198 | /* At that step HW was already reset, now notify clients */ |
| 199 | mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); |
Yishai Hadas | f5aef5a | 2015-01-25 16:59:39 +0200 | [diff] [blame] | 200 | mlx4_cmd_wake_completions(dev); |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 201 | return; |
| 202 | |
| 203 | out: |
| 204 | mutex_unlock(&persist->device_state_mutex); |
| 205 | } |
| 206 | |
| 207 | static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) |
| 208 | { |
| 209 | int err = 0; |
| 210 | |
| 211 | mlx4_enter_error_state(persist); |
Yishai Hadas | c69453e | 2015-01-25 16:59:40 +0200 | [diff] [blame] | 212 | mutex_lock(&persist->interface_state_mutex); |
| 213 | if (persist->interface_state & MLX4_INTERFACE_STATE_UP && |
| 214 | !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { |
| 215 | err = mlx4_restart_one(persist->pdev); |
| 216 | mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", |
| 217 | err); |
| 218 | } |
| 219 | mutex_unlock(&persist->interface_state_mutex); |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 220 | } |
| 221 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 222 | static void dump_err_buf(struct mlx4_dev *dev) |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 223 | { |
| 224 | struct mlx4_priv *priv = mlx4_priv(dev); |
| 225 | |
| 226 | int i; |
| 227 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 228 | mlx4_err(dev, "Internal error detected:\n"); |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 229 | for (i = 0; i < priv->fw.catas_size; ++i) |
| 230 | mlx4_err(dev, " buf[%02x]: %08x\n", |
| 231 | i, swab32(readl(priv->catas_err.map + i))); |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 232 | } |
| 233 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 234 | static void poll_catas(unsigned long dev_ptr) |
| 235 | { |
| 236 | struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr; |
| 237 | struct mlx4_priv *priv = mlx4_priv(dev); |
Yishai Hadas | 55ad359 | 2015-01-25 16:59:42 +0200 | [diff] [blame] | 238 | u32 slave_read; |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 239 | |
Yishai Hadas | 55ad359 | 2015-01-25 16:59:42 +0200 | [diff] [blame] | 240 | if (mlx4_is_slave(dev)) { |
| 241 | slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); |
| 242 | if (mlx4_comm_internal_err(slave_read)) { |
| 243 | mlx4_warn(dev, "Internal error detected on the communication channel\n"); |
| 244 | goto internal_err; |
| 245 | } |
| 246 | } else if (readl(priv->catas_err.map)) { |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 247 | dump_err_buf(dev); |
| 248 | goto internal_err; |
| 249 | } |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 250 | |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 251 | if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { |
| 252 | mlx4_warn(dev, "Internal error mark was detected on device\n"); |
| 253 | goto internal_err; |
| 254 | } |
| 255 | |
| 256 | mod_timer(&priv->catas_err.timer, |
| 257 | round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); |
| 258 | return; |
| 259 | |
| 260 | internal_err: |
Yishai Hadas | f5aef5a | 2015-01-25 16:59:39 +0200 | [diff] [blame] | 261 | if (mlx4_internal_err_reset) |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 262 | queue_work(dev->persist->catas_wq, &dev->persist->catas_work); |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 263 | } |
| 264 | |
| 265 | static void catas_reset(struct work_struct *work) |
| 266 | { |
Yishai Hadas | ad9a0bf | 2015-01-25 16:59:37 +0200 | [diff] [blame] | 267 | struct mlx4_dev_persistent *persist = |
| 268 | container_of(work, struct mlx4_dev_persistent, |
| 269 | catas_work); |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 270 | |
Yishai Hadas | f6bc11e | 2015-01-25 16:59:38 +0200 | [diff] [blame] | 271 | mlx4_handle_error_state(persist); |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 272 | } |
| 273 | |
| 274 | void mlx4_start_catas_poll(struct mlx4_dev *dev) |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 275 | { |
| 276 | struct mlx4_priv *priv = mlx4_priv(dev); |
Roland Dreier | 4979d18 | 2011-01-12 09:50:36 -0800 | [diff] [blame] | 277 | phys_addr_t addr; |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 278 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 279 | INIT_LIST_HEAD(&priv->catas_err.list); |
| 280 | init_timer(&priv->catas_err.timer); |
| 281 | priv->catas_err.map = NULL; |
| 282 | |
Yishai Hadas | 55ad359 | 2015-01-25 16:59:42 +0200 | [diff] [blame] | 283 | if (!mlx4_is_slave(dev)) { |
| 284 | addr = pci_resource_start(dev->persist->pdev, |
| 285 | priv->fw.catas_bar) + |
| 286 | priv->fw.catas_offset; |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 287 | |
Yishai Hadas | 55ad359 | 2015-01-25 16:59:42 +0200 | [diff] [blame] | 288 | priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); |
| 289 | if (!priv->catas_err.map) { |
| 290 | mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", |
| 291 | (unsigned long long)addr); |
| 292 | return; |
| 293 | } |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 294 | } |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 295 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 296 | priv->catas_err.timer.data = (unsigned long) dev; |
| 297 | priv->catas_err.timer.function = poll_catas; |
| 298 | priv->catas_err.timer.expires = |
| 299 | round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL); |
| 300 | add_timer(&priv->catas_err.timer); |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 301 | } |
| 302 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 303 | void mlx4_stop_catas_poll(struct mlx4_dev *dev) |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 304 | { |
| 305 | struct mlx4_priv *priv = mlx4_priv(dev); |
| 306 | |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 307 | del_timer_sync(&priv->catas_err.timer); |
| 308 | |
Yishai Hadas | ad9a0bf | 2015-01-25 16:59:37 +0200 | [diff] [blame] | 309 | if (priv->catas_err.map) { |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 310 | iounmap(priv->catas_err.map); |
Yishai Hadas | ad9a0bf | 2015-01-25 16:59:37 +0200 | [diff] [blame] | 311 | priv->catas_err.map = NULL; |
| 312 | } |
Yishai Hadas | c69453e | 2015-01-25 16:59:40 +0200 | [diff] [blame] | 313 | |
| 314 | if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) |
| 315 | flush_workqueue(dev->persist->catas_wq); |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 316 | } |
| 317 | |
Yishai Hadas | ad9a0bf | 2015-01-25 16:59:37 +0200 | [diff] [blame] | 318 | int mlx4_catas_init(struct mlx4_dev *dev) |
Jack Morgenstein | ee49bd9 | 2007-07-12 17:50:45 +0300 | [diff] [blame] | 319 | { |
Yishai Hadas | ad9a0bf | 2015-01-25 16:59:37 +0200 | [diff] [blame] | 320 | INIT_WORK(&dev->persist->catas_work, catas_reset); |
| 321 | dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); |
| 322 | if (!dev->persist->catas_wq) |
| 323 | return -ENOMEM; |
| 324 | |
| 325 | return 0; |
| 326 | } |
| 327 | |
| 328 | void mlx4_catas_end(struct mlx4_dev *dev) |
| 329 | { |
| 330 | if (dev->persist->catas_wq) { |
| 331 | destroy_workqueue(dev->persist->catas_wq); |
| 332 | dev->persist->catas_wq = NULL; |
| 333 | } |
Roland Dreier | 225c7b1 | 2007-05-08 18:00:38 -0700 | [diff] [blame] | 334 | } |