Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 2 | * Copyright (C) 2001 Dave Engebretsen IBM Corporation |
Michael Ellerman | d995310 | 2005-10-24 15:07:30 +1000 | [diff] [blame] | 3 | * |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 4 | * This program is free software; you can redistribute it and/or modify |
| 5 | * it under the terms of the GNU General Public License as published by |
| 6 | * the Free Software Foundation; either version 2 of the License, or |
| 7 | * (at your option) any later version. |
Michael Ellerman | d995310 | 2005-10-24 15:07:30 +1000 | [diff] [blame] | 8 | * |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 9 | * This program is distributed in the hope that it will be useful, |
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | * GNU General Public License for more details. |
Michael Ellerman | d995310 | 2005-10-24 15:07:30 +1000 | [diff] [blame] | 13 | * |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 14 | * You should have received a copy of the GNU General Public License |
| 15 | * along with this program; if not, write to the Free Software |
| 16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 17 | */ |
| 18 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 19 | #include <linux/sched.h> |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 20 | #include <linux/interrupt.h> |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 21 | #include <linux/irq.h> |
Anton Blanchard | 9012899 | 2012-03-21 15:59:04 +0000 | [diff] [blame] | 22 | #include <linux/of.h> |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 23 | #include <linux/fs.h> |
| 24 | #include <linux/reboot.h> |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 25 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 26 | #include <asm/machdep.h> |
| 27 | #include <asm/rtas.h> |
Michael Ellerman | 8c4f1f2 | 2005-12-04 18:39:33 +1100 | [diff] [blame] | 28 | #include <asm/firmware.h> |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 29 | |
Michael Ellerman | 577830b | 2007-02-08 18:33:51 +1100 | [diff] [blame] | 30 | #include "pseries.h" |
Arnd Bergmann | c902be7 | 2006-01-04 19:55:53 +0000 | [diff] [blame] | 31 | |
/*
 * Buffer handed (by physical address) to the RTAS check-exception call in
 * the EPOW and internal-error interrupt handlers. Both handlers take
 * ras_log_buf_lock around their use of the buffer.
 */
static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_SPINLOCK(ras_log_buf_lock);

/*
 * Storage for error logs picked up on the FWNMI path: extended logs are
 * copied into global_mce_data_buf, non-extended ones into the per-cpu
 * 64-bit mce_data_buf.
 */
static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
static DEFINE_PER_CPU(__u64, mce_data_buf);

/* RTAS token for "check-exception", looked up once in init_ras_IRQ() */
static int ras_check_exception_token;

/* Sensor token and index passed to rtas_get_sensor() by ras_epow_interrupt() */
#define EPOW_SENSOR_TOKEN 9
#define EPOW_SENSOR_INDEX 0

/* Defined further down; init_ras_IRQ() needs them before their definitions */
static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 45 | |
Benjamin Herrenschmidt | 0ebfff1 | 2006-07-03 21:36:01 +1000 | [diff] [blame] | 46 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 47 | /* |
| 48 | * Initialize handlers for the set of interrupts caused by hardware errors |
| 49 | * and power system events. |
| 50 | */ |
| 51 | static int __init init_ras_IRQ(void) |
| 52 | { |
| 53 | struct device_node *np; |
| 54 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 55 | ras_check_exception_token = rtas_token("check-exception"); |
| 56 | |
| 57 | /* Internal Errors */ |
| 58 | np = of_find_node_by_path("/event-sources/internal-errors"); |
| 59 | if (np != NULL) { |
Mark Nelson | 32c96f7 | 2010-05-18 22:51:00 +0000 | [diff] [blame] | 60 | request_event_sources_irqs(np, ras_error_interrupt, |
| 61 | "RAS_ERROR"); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 62 | of_node_put(np); |
| 63 | } |
| 64 | |
| 65 | /* EPOW Events */ |
| 66 | np = of_find_node_by_path("/event-sources/epow-events"); |
| 67 | if (np != NULL) { |
Mark Nelson | 32c96f7 | 2010-05-18 22:51:00 +0000 | [diff] [blame] | 68 | request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 69 | of_node_put(np); |
| 70 | } |
| 71 | |
Anton Blanchard | 69ed332 | 2006-03-28 14:08:39 +1100 | [diff] [blame] | 72 | return 0; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 73 | } |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 74 | subsys_initcall(init_ras_IRQ); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 75 | |
/* Event modifier values dispatched on by handle_system_shutdown() */
#define EPOW_SHUTDOWN_NORMAL 1
#define EPOW_SHUTDOWN_ON_UPS 2
#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3
#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4
| 80 | |
| 81 | static void handle_system_shutdown(char event_modifier) |
| 82 | { |
| 83 | switch (event_modifier) { |
| 84 | case EPOW_SHUTDOWN_NORMAL: |
| 85 | pr_emerg("Firmware initiated power off"); |
liguang | 1b7e0cb | 2013-05-30 15:20:33 +0800 | [diff] [blame] | 86 | orderly_poweroff(true); |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 87 | break; |
| 88 | |
| 89 | case EPOW_SHUTDOWN_ON_UPS: |
| 90 | pr_emerg("Loss of power reported by firmware, system is " |
| 91 | "running on UPS/battery"); |
| 92 | break; |
| 93 | |
| 94 | case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: |
| 95 | pr_emerg("Loss of system critical functions reported by " |
| 96 | "firmware"); |
| 97 | pr_emerg("Check RTAS error log for details"); |
liguang | 1b7e0cb | 2013-05-30 15:20:33 +0800 | [diff] [blame] | 98 | orderly_poweroff(true); |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 99 | break; |
| 100 | |
| 101 | case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: |
| 102 | pr_emerg("Ambient temperature too high reported by firmware"); |
| 103 | pr_emerg("Check RTAS error log for details"); |
liguang | 1b7e0cb | 2013-05-30 15:20:33 +0800 | [diff] [blame] | 104 | orderly_poweroff(true); |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 105 | break; |
| 106 | |
| 107 | default: |
| 108 | pr_err("Unknown power/cooling shutdown event (modifier %d)", |
| 109 | event_modifier); |
| 110 | } |
| 111 | } |
| 112 | |
/*
 * Layout of the EPOW section of a pSeries error log, as overlaid on
 * pseries_log->data by rtas_parse_epow_errlog().
 */
struct epow_errorlog {
	unsigned char sensor_value;	/* bottom 4 bits: EPOW action code */
	unsigned char event_modifier;	/* bottom 4 bits: event modifier */
	unsigned char extended_modifier;
	unsigned char reserved;
	unsigned char platform_reason;
};
| 120 | |
/*
 * EPOW action codes, taken from the bottom 4 bits of
 * epow_errorlog.sensor_value and switched on in rtas_parse_epow_errlog().
 */
#define EPOW_RESET 0
#define EPOW_WARN_COOLING 1
#define EPOW_WARN_POWER 2
#define EPOW_SYSTEM_SHUTDOWN 3
#define EPOW_SYSTEM_HALT 4
#define EPOW_MAIN_ENCLOSURE 5
#define EPOW_POWER_OFF 7
| 128 | |
| 129 | void rtas_parse_epow_errlog(struct rtas_error_log *log) |
| 130 | { |
| 131 | struct pseries_errorlog *pseries_log; |
| 132 | struct epow_errorlog *epow_log; |
| 133 | char action_code; |
| 134 | char modifier; |
| 135 | |
| 136 | pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); |
| 137 | if (pseries_log == NULL) |
| 138 | return; |
| 139 | |
| 140 | epow_log = (struct epow_errorlog *)pseries_log->data; |
| 141 | action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ |
| 142 | modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ |
| 143 | |
| 144 | switch (action_code) { |
| 145 | case EPOW_RESET: |
| 146 | pr_err("Non critical power or cooling issue cleared"); |
| 147 | break; |
| 148 | |
| 149 | case EPOW_WARN_COOLING: |
| 150 | pr_err("Non critical cooling issue reported by firmware"); |
| 151 | pr_err("Check RTAS error log for details"); |
| 152 | break; |
| 153 | |
| 154 | case EPOW_WARN_POWER: |
| 155 | pr_err("Non critical power issue reported by firmware"); |
| 156 | pr_err("Check RTAS error log for details"); |
| 157 | break; |
| 158 | |
| 159 | case EPOW_SYSTEM_SHUTDOWN: |
| 160 | handle_system_shutdown(epow_log->event_modifier); |
| 161 | break; |
| 162 | |
| 163 | case EPOW_SYSTEM_HALT: |
| 164 | pr_emerg("Firmware initiated power off"); |
liguang | 1b7e0cb | 2013-05-30 15:20:33 +0800 | [diff] [blame] | 165 | orderly_poweroff(true); |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 166 | break; |
| 167 | |
| 168 | case EPOW_MAIN_ENCLOSURE: |
| 169 | case EPOW_POWER_OFF: |
| 170 | pr_emerg("Critical power/cooling issue reported by firmware"); |
| 171 | pr_emerg("Check RTAS error log for details"); |
| 172 | pr_emerg("Immediate power off"); |
| 173 | emergency_sync(); |
| 174 | kernel_power_off(); |
| 175 | break; |
| 176 | |
| 177 | default: |
| 178 | pr_err("Unknown power/cooling event (action code %d)", |
| 179 | action_code); |
| 180 | } |
| 181 | } |
| 182 | |
| 183 | /* Handle environmental and power warning (EPOW) interrupts. */ |
David Howells | 7d12e78 | 2006-10-05 14:55:46 +0100 | [diff] [blame] | 184 | static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 185 | { |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 186 | int status; |
| 187 | int state; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 188 | int critical; |
| 189 | |
Anton Blanchard | 587f83e | 2012-03-21 15:53:43 +0000 | [diff] [blame] | 190 | status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 191 | |
| 192 | if (state > 3) |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 193 | critical = 1; /* Time Critical */ |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 194 | else |
| 195 | critical = 0; |
| 196 | |
| 197 | spin_lock(&ras_log_buf_lock); |
| 198 | |
| 199 | status = rtas_call(ras_check_exception_token, 6, 1, NULL, |
Mark Nelson | b08e281 | 2010-05-26 21:40:39 +0000 | [diff] [blame] | 200 | RTAS_VECTOR_EXTERNAL_INTERRUPT, |
Grant Likely | 476eb49 | 2011-05-04 15:02:15 +1000 | [diff] [blame] | 201 | virq_to_hw(irq), |
Anton Blanchard | 6f43747 | 2012-03-21 15:56:49 +0000 | [diff] [blame] | 202 | RTAS_EPOW_WARNING, |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 203 | critical, __pa(&ras_log_buf), |
| 204 | rtas_get_error_log_max()); |
| 205 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 206 | log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); |
| 207 | |
Anton Blanchard | 55fc0c5 | 2012-03-21 15:49:59 +0000 | [diff] [blame] | 208 | rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); |
| 209 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 210 | spin_unlock(&ras_log_buf_lock); |
| 211 | return IRQ_HANDLED; |
| 212 | } |
| 213 | |
| 214 | /* |
| 215 | * Handle hardware error interrupts. |
| 216 | * |
| 217 | * RTAS check-exception is called to collect data on the exception. If |
| 218 | * the error is deemed recoverable, we log a warning and return. |
| 219 | * For nonrecoverable errors, an error is logged and we stop all processing |
| 220 | * as quickly as possible in order to prevent propagation of the failure. |
| 221 | */ |
David Howells | 7d12e78 | 2006-10-05 14:55:46 +0100 | [diff] [blame] | 222 | static irqreturn_t ras_error_interrupt(int irq, void *dev_id) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 223 | { |
| 224 | struct rtas_error_log *rtas_elog; |
Anton Blanchard | cc8b526 | 2012-03-21 15:58:03 +0000 | [diff] [blame] | 225 | int status; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 226 | int fatal; |
| 227 | |
| 228 | spin_lock(&ras_log_buf_lock); |
| 229 | |
| 230 | status = rtas_call(ras_check_exception_token, 6, 1, NULL, |
Mark Nelson | b08e281 | 2010-05-26 21:40:39 +0000 | [diff] [blame] | 231 | RTAS_VECTOR_EXTERNAL_INTERRUPT, |
Grant Likely | 476eb49 | 2011-05-04 15:02:15 +1000 | [diff] [blame] | 232 | virq_to_hw(irq), |
Anton Blanchard | cc8b526 | 2012-03-21 15:58:03 +0000 | [diff] [blame] | 233 | RTAS_INTERNAL_ERROR, 1 /* Time Critical */, |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 234 | __pa(&ras_log_buf), |
| 235 | rtas_get_error_log_max()); |
| 236 | |
| 237 | rtas_elog = (struct rtas_error_log *)ras_log_buf; |
| 238 | |
Greg Kurz | a08a53ea | 2014-04-04 09:35:13 +0200 | [diff] [blame] | 239 | if (status == 0 && |
| 240 | rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 241 | fatal = 1; |
| 242 | else |
| 243 | fatal = 0; |
| 244 | |
| 245 | /* format and print the extended information */ |
| 246 | log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); |
| 247 | |
| 248 | if (fatal) { |
Anton Blanchard | cc8b526 | 2012-03-21 15:58:03 +0000 | [diff] [blame] | 249 | pr_emerg("Fatal hardware error reported by firmware"); |
| 250 | pr_emerg("Check RTAS error log for details"); |
| 251 | pr_emerg("Immediate power off"); |
| 252 | emergency_sync(); |
| 253 | kernel_power_off(); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 254 | } else { |
Anton Blanchard | cc8b526 | 2012-03-21 15:58:03 +0000 | [diff] [blame] | 255 | pr_err("Recoverable hardware error reported by firmware"); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 256 | } |
| 257 | |
| 258 | spin_unlock(&ras_log_buf_lock); |
| 259 | return IRQ_HANDLED; |
| 260 | } |
| 261 | |
/*
 * Some versions of FWNMI place the buffer inside the 4kB page starting at
 * 0x7000. Other versions place it inside the rtas buffer. We check both.
 *
 * Both accepted ranges stop 16 bytes short of the end of their region.
 * (A) is expanded up to four times, so pass an expression without side
 * effects.
 */
#define VALID_FWNMI_BUFFER(A) \
	((((A) >= 0x7000) && ((A) < 0x7ff0)) || \
	(((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
| 269 | |
| 270 | /* |
| 271 | * Get the error information for errors coming through the |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 272 | * FWNMI vectors. The pt_regs' r3 will be updated to reflect |
| 273 | * the actual r3 if possible, and a ptr to the error log entry |
| 274 | * will be returned if found. |
| 275 | * |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 276 | * If the RTAS error is not of the extended type, then we put it in a per |
| 277 | * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf. |
| 278 | * |
| 279 | * The global_mce_data_buf does not have any locks or protection around it, |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 280 | * if a second machine check comes in, or a system reset is done |
| 281 | * before we have logged the error, then we will get corruption in the |
| 282 | * error log. This is preferable over holding off on calling |
| 283 | * ibm,nmi-interlock which would result in us checkstopping if a |
| 284 | * second machine check did come in. |
| 285 | */ |
| 286 | static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) |
| 287 | { |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 288 | unsigned long *savep; |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 289 | struct rtas_error_log *h, *errhdr = NULL; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 290 | |
Mahesh Salgaonkar | ee1dd1e | 2013-07-10 18:32:56 +0530 | [diff] [blame] | 291 | /* Mask top two bits */ |
| 292 | regs->gpr[3] &= ~(0x3UL << 62); |
| 293 | |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 294 | if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { |
Anton Blanchard | f0e939a | 2011-05-10 13:34:03 +0000 | [diff] [blame] | 295 | printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 296 | return NULL; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 297 | } |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 298 | |
| 299 | savep = __va(regs->gpr[3]); |
| 300 | regs->gpr[3] = savep[0]; /* restore original r3 */ |
| 301 | |
| 302 | /* If it isn't an extended log we can use the per cpu 64bit buffer */ |
| 303 | h = (struct rtas_error_log *)&savep[1]; |
Greg Kurz | a08a53ea | 2014-04-04 09:35:13 +0200 | [diff] [blame] | 304 | if (!rtas_error_extended(h)) { |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 305 | memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64)); |
| 306 | errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf); |
| 307 | } else { |
Greg Kurz | a08a53ea | 2014-04-04 09:35:13 +0200 | [diff] [blame] | 308 | int len, error_log_length; |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 309 | |
Greg Kurz | a08a53ea | 2014-04-04 09:35:13 +0200 | [diff] [blame] | 310 | error_log_length = 8 + rtas_error_extended_log_length(h); |
| 311 | len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX); |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 312 | memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); |
| 313 | memcpy(global_mce_data_buf, h, len); |
| 314 | errhdr = (struct rtas_error_log *)global_mce_data_buf; |
| 315 | } |
| 316 | |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 317 | return errhdr; |
| 318 | } |
| 319 | |
| 320 | /* Call this when done with the data returned by FWNMI_get_errinfo. |
| 321 | * It will release the saved data area for other CPUs in the |
| 322 | * partition to receive FWNMI errors. |
| 323 | */ |
| 324 | static void fwnmi_release_errinfo(void) |
| 325 | { |
| 326 | int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); |
| 327 | if (ret != 0) |
Anton Blanchard | d368514 | 2011-01-11 19:50:51 +0000 | [diff] [blame] | 328 | printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 329 | } |
| 330 | |
Arnd Bergmann | c902be7 | 2006-01-04 19:55:53 +0000 | [diff] [blame] | 331 | int pSeries_system_reset_exception(struct pt_regs *regs) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 332 | { |
| 333 | if (fwnmi_active) { |
| 334 | struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); |
| 335 | if (errhdr) { |
| 336 | /* XXX Should look at FWNMI information */ |
| 337 | } |
| 338 | fwnmi_release_errinfo(); |
| 339 | } |
Arnd Bergmann | c902be7 | 2006-01-04 19:55:53 +0000 | [diff] [blame] | 340 | return 0; /* need to perform reset */ |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 341 | } |
| 342 | |
| 343 | /* |
| 344 | * See if we can recover from a machine check exception. |
| 345 | * This is only called on power4 (or above) and only via |
| 346 | * the Firmware Non-Maskable Interrupts (fwnmi) handler |
| 347 | * which provides the error analysis for us. |
| 348 | * |
| 349 | * Return 1 if corrected (or delivered a signal). |
| 350 | * Return 0 if there is nothing we can do. |
| 351 | */ |
Anton Blanchard | d47d1d8 | 2011-01-11 19:49:19 +0000 | [diff] [blame] | 352 | static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 353 | { |
Anton Blanchard | d47d1d8 | 2011-01-11 19:49:19 +0000 | [diff] [blame] | 354 | int recovered = 0; |
Greg Kurz | a08a53ea | 2014-04-04 09:35:13 +0200 | [diff] [blame] | 355 | int disposition = rtas_error_disposition(err); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 356 | |
Anton Blanchard | d47d1d8 | 2011-01-11 19:49:19 +0000 | [diff] [blame] | 357 | if (!(regs->msr & MSR_RI)) { |
| 358 | /* If MSR_RI isn't set, we cannot recover */ |
| 359 | recovered = 0; |
| 360 | |
Greg Kurz | a08a53ea | 2014-04-04 09:35:13 +0200 | [diff] [blame] | 361 | } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 362 | /* Platform corrected itself */ |
Anton Blanchard | d47d1d8 | 2011-01-11 19:49:19 +0000 | [diff] [blame] | 363 | recovered = 1; |
| 364 | |
Greg Kurz | a08a53ea | 2014-04-04 09:35:13 +0200 | [diff] [blame] | 365 | } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { |
Anton Blanchard | d47d1d8 | 2011-01-11 19:49:19 +0000 | [diff] [blame] | 366 | /* Platform corrected itself but could be degraded */ |
| 367 | printk(KERN_ERR "MCE: limited recovery, system may " |
| 368 | "be degraded\n"); |
| 369 | recovered = 1; |
| 370 | |
| 371 | } else if (user_mode(regs) && !is_global_init(current) && |
Greg Kurz | a08a53ea | 2014-04-04 09:35:13 +0200 | [diff] [blame] | 372 | rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { |
Anton Blanchard | d47d1d8 | 2011-01-11 19:49:19 +0000 | [diff] [blame] | 373 | |
| 374 | /* |
| 375 | * If we received a synchronous error when in userspace |
| 376 | * kill the task. Firmware may report details of the fail |
| 377 | * asynchronously, so we can't rely on the target and type |
| 378 | * fields being valid here. |
| 379 | */ |
| 380 | printk(KERN_ERR "MCE: uncorrectable error, killing task " |
| 381 | "%s:%d\n", current->comm, current->pid); |
| 382 | |
| 383 | _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); |
| 384 | recovered = 1; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 385 | } |
| 386 | |
Anton Blanchard | 3f9793e | 2011-01-11 19:46:29 +0000 | [diff] [blame] | 387 | log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 388 | |
Anton Blanchard | d47d1d8 | 2011-01-11 19:49:19 +0000 | [diff] [blame] | 389 | return recovered; |
Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 390 | } |
| 391 | |
| 392 | /* |
| 393 | * Handle a machine check. |
| 394 | * |
| 395 | * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) |
| 396 | * should be present. If so the handler which called us tells us if the |
| 397 | * error was recovered (never true if RI=0). |
| 398 | * |
| 399 | * On hardware prior to Power 4 these exceptions were asynchronous which |
| 400 | * means we can't tell exactly where it occurred and so we can't recover. |
| 401 | */ |
| 402 | int pSeries_machine_check_exception(struct pt_regs *regs) |
| 403 | { |
| 404 | struct rtas_error_log *errp; |
| 405 | |
| 406 | if (fwnmi_active) { |
| 407 | errp = fwnmi_get_errinfo(regs); |
| 408 | fwnmi_release_errinfo(); |
| 409 | if (errp && recover_mce(regs, errp)) |
| 410 | return 1; |
| 411 | } |
| 412 | |
| 413 | return 0; |
| 414 | } |