blob: 1bde4514107323e494393967f924b5b2a5613ac0 [file] [log] [blame]
/*
 * GHES/EDAC Linux driver
 *
 * This file may be distributed under the terms of the GNU General Public
 * License version 2.
 *
 * Copyright (c) 2013 by Mauro Carvalho Chehab <mchehab@redhat.com>
 *
 * Red Hat Inc. http://www.redhat.com
 */
11
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -030012#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -030014#include <acpi/ghes.h>
15#include <linux/edac.h>
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -030016#include <linux/dmi.h>
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -030017#include "edac_core.h"
18
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -030019#define GHES_EDAC_REVISION " Ver: 1.0.0"
20
21struct ghes_edac_pvt {
22 struct list_head list;
23 struct ghes *ghes;
24 struct mem_ctl_info *mci;
Mauro Carvalho Chehab689c9cd2013-02-19 19:24:12 -030025
26 /* Buffers for the error handling routine */
27 char other_detail[160];
28 char msg[80];
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -030029};
30
/* Serializes edac_mc_alloc()/edac_mc_add_mc() in ghes_edac_register() */
static DEFINE_MUTEX(ghes_edac_lock);
/* Count of registered memory controllers; also the next controller index */
static int ghes_edac_mc_num;
/* Every registered struct ghes_edac_pvt, linked through its 'list' member */
static LIST_HEAD(ghes_reglist);
34
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -030035
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -030036/* Memory Device - Type 17 of SMBIOS spec */
37struct memdev_dmi_entry {
38 u8 type;
39 u8 length;
40 u16 handle;
41 u16 phys_mem_array_handle;
42 u16 mem_err_info_handle;
43 u16 total_width;
44 u16 data_width;
45 u16 size;
46 u8 form_factor;
47 u8 device_set;
48 u8 device_locator;
49 u8 bank_locator;
50 u8 memory_type;
51 u16 type_detail;
52 u16 speed;
53 u8 manufacturer;
54 u8 serial_number;
55 u8 asset_tag;
56 u8 part_number;
57 u8 attributes;
58 u32 extended_size;
59 u16 conf_mem_clk_speed;
60} __attribute__((__packed__));
61
/* Cursor handed to the DMI walk that fills in the DIMM information */
struct ghes_edac_dimm_fill {
	/* memory controller whose DIMM slots are being filled */
	struct mem_ctl_info *mci;
	/* index of the next DIMM slot to fill */
	unsigned int count;
};
66
67char *memory_type[] = {
68 [MEM_EMPTY] = "EMPTY",
69 [MEM_RESERVED] = "RESERVED",
70 [MEM_UNKNOWN] = "UNKNOWN",
71 [MEM_FPM] = "FPM",
72 [MEM_EDO] = "EDO",
73 [MEM_BEDO] = "BEDO",
74 [MEM_SDR] = "SDR",
75 [MEM_RDR] = "RDR",
76 [MEM_DDR] = "DDR",
77 [MEM_RDDR] = "RDDR",
78 [MEM_RMBS] = "RMBS",
79 [MEM_DDR2] = "DDR2",
80 [MEM_FB_DDR2] = "FB_DDR2",
81 [MEM_RDDR2] = "RDDR2",
82 [MEM_XDR] = "XDR",
83 [MEM_DDR3] = "DDR3",
84 [MEM_RDDR3] = "RDDR3",
85};
86
87static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
88{
89 int *num_dimm = arg;
90
91 if (dh->type == DMI_ENTRY_MEM_DEVICE)
92 (*num_dimm)++;
93}
94
95static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
96{
97 struct ghes_edac_dimm_fill *dimm_fill = arg;
98 struct mem_ctl_info *mci = dimm_fill->mci;
99
100 if (dh->type == DMI_ENTRY_MEM_DEVICE) {
101 struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
102 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
103 mci->n_layers,
104 dimm_fill->count, 0, 0);
105
106 if (entry->size == 0xffff) {
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -0300107 pr_info("Can't get DIMM%i size\n",
108 dimm_fill->count);
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300109 dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
110 } else if (entry->size == 0x7fff) {
111 dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
112 } else {
113 if (entry->size & 1 << 15)
114 dimm->nr_pages = MiB_TO_PAGES((entry->size &
115 0x7fff) << 10);
116 else
117 dimm->nr_pages = MiB_TO_PAGES(entry->size);
118 }
119
120 switch (entry->memory_type) {
121 case 0x12:
122 if (entry->type_detail & 1 << 13)
123 dimm->mtype = MEM_RDDR;
124 else
125 dimm->mtype = MEM_DDR;
126 break;
127 case 0x13:
128 if (entry->type_detail & 1 << 13)
129 dimm->mtype = MEM_RDDR2;
130 else
131 dimm->mtype = MEM_DDR2;
132 break;
133 case 0x14:
134 dimm->mtype = MEM_FB_DDR2;
135 break;
136 case 0x18:
137 if (entry->type_detail & 1 << 13)
138 dimm->mtype = MEM_RDDR3;
139 else
140 dimm->mtype = MEM_DDR3;
141 break;
142 default:
143 if (entry->type_detail & 1 << 6)
144 dimm->mtype = MEM_RMBS;
145 else if ((entry->type_detail & ((1 << 7) | (1 << 13)))
146 == ((1 << 7) | (1 << 13)))
147 dimm->mtype = MEM_RDR;
148 else if (entry->type_detail & 1 << 7)
149 dimm->mtype = MEM_SDR;
150 else if (entry->type_detail & 1 << 9)
151 dimm->mtype = MEM_EDO;
152 else
153 dimm->mtype = MEM_UNKNOWN;
154 }
155
156 /*
157 * Actually, we can only detect if the memory has bits for
158 * checksum or not
159 */
160 if (entry->total_width == entry->data_width)
161 dimm->edac_mode = EDAC_NONE;
162 else
163 dimm->edac_mode = EDAC_SECDED;
164
165 dimm->dtype = DEV_UNKNOWN;
166 dimm->grain = 128; /* Likely, worse case */
167
168 /*
169 * FIXME: It shouldn't be hard to also fill the DIMM labels
170 */
171
172 if (dimm->nr_pages) {
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -0300173 edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300174 dimm_fill->count, memory_type[dimm->mtype],
175 PAGES_TO_MiB(dimm->nr_pages),
176 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -0300177 edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300178 entry->memory_type, entry->type_detail,
179 entry->total_width, entry->data_width);
180 }
181
182 dimm_fill->count++;
183 }
184}
185
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300186void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
Mauro Carvalho Chehabf04c62a2013-02-15 06:36:27 -0300187 struct cper_sec_mem_err *mem_err)
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300188{
Mauro Carvalho Chehabf04c62a2013-02-15 06:36:27 -0300189 enum hw_event_mc_err_type type;
190 struct edac_raw_error_desc *e;
191 struct mem_ctl_info *mci;
192 struct ghes_edac_pvt *pvt = NULL;
Mauro Carvalho Chehab689c9cd2013-02-19 19:24:12 -0300193 char *p;
Mauro Carvalho Chehabf04c62a2013-02-15 06:36:27 -0300194
195 list_for_each_entry(pvt, &ghes_reglist, list) {
196 if (ghes == pvt->ghes)
197 break;
198 }
199 if (!pvt) {
200 pr_err("Internal error: Can't find EDAC structure\n");
201 return;
202 }
203 mci = pvt->mci;
204 e = &mci->error_desc;
205
206 /* Cleans the error report buffer */
207 memset(e, 0, sizeof (*e));
208 e->error_count = 1;
Mauro Carvalho Chehab689c9cd2013-02-19 19:24:12 -0300209 strcpy(e->label, "unknown label");
210 e->msg = pvt->msg;
211 e->other_detail = pvt->other_detail;
212 e->top_layer = -1;
213 e->mid_layer = -1;
214 e->low_layer = -1;
215 *pvt->other_detail = '\0';
216 *pvt->msg = '\0';
Mauro Carvalho Chehabf04c62a2013-02-15 06:36:27 -0300217
218 switch (sev) {
219 case GHES_SEV_CORRECTED:
220 type = HW_EVENT_ERR_CORRECTED;
221 break;
222 case GHES_SEV_RECOVERABLE:
223 type = HW_EVENT_ERR_UNCORRECTED;
224 break;
225 case GHES_SEV_PANIC:
226 type = HW_EVENT_ERR_FATAL;
227 break;
228 default:
229 case GHES_SEV_NO:
230 type = HW_EVENT_ERR_INFO;
231 }
232
Mauro Carvalho Chehab689c9cd2013-02-19 19:24:12 -0300233 edac_dbg(1, "error validation_bits: 0x%08llx\n",
234 (long long)mem_err->validation_bits);
235
236 /* Error type, mapped on e->msg */
237 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
238 p = pvt->msg;
239 switch (mem_err->error_type) {
240 case 0:
241 p += sprintf(p, "Unknown");
242 break;
243 case 1:
244 p += sprintf(p, "No error");
245 break;
246 case 2:
247 p += sprintf(p, "Single-bit ECC");
248 break;
249 case 3:
250 p += sprintf(p, "Multi-bit ECC");
251 break;
252 case 4:
253 p += sprintf(p, "Single-symbol ChipKill ECC");
254 break;
255 case 5:
256 p += sprintf(p, "Multi-symbol ChipKill ECC");
257 break;
258 case 6:
259 p += sprintf(p, "Master abort");
260 break;
261 case 7:
262 p += sprintf(p, "Target abort");
263 break;
264 case 8:
265 p += sprintf(p, "Parity Error");
266 break;
267 case 9:
268 p += sprintf(p, "Watchdog timeout");
269 break;
270 case 10:
271 p += sprintf(p, "Invalid address");
272 break;
273 case 11:
274 p += sprintf(p, "Mirror Broken");
275 break;
276 case 12:
277 p += sprintf(p, "Memory Sparing");
278 break;
279 case 13:
280 p += sprintf(p, "Scrub corrected error");
281 break;
282 case 14:
283 p += sprintf(p, "Scrub uncorrected error");
284 break;
285 case 15:
286 p += sprintf(p, "Physical Memory Map-out event");
287 break;
288 default:
289 p += sprintf(p, "reserved error (%d)",
290 mem_err->error_type);
291 }
292 } else {
293 strcpy(pvt->msg, "unknown error");
294 }
295
296 /* Error address */
297 if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
298 e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
299 e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
300 }
301
302 /* Error grain */
303 if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) {
304 e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
305 }
306
307 /* Memory error location, mapped on e->location */
308 p = e->location;
309 if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
310 p += sprintf(p, "node:%d ", mem_err->node);
311 if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
312 p += sprintf(p, "card:%d ", mem_err->card);
313 if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
314 p += sprintf(p, "module:%d ", mem_err->module);
315 if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
316 p += sprintf(p, "bank:%d ", mem_err->bank);
317 if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
318 p += sprintf(p, "row:%d ", mem_err->row);
319 if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
320 p += sprintf(p, "col:%d ", mem_err->column);
321 if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
322 p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
323 if (p > e->location)
324 *(p - 1) = '\0';
325
326 /* All other fields are mapped on e->other_detail */
327 p = pvt->other_detail;
328 if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
329 u64 status = mem_err->error_status;
330
331 p += sprintf(p, "status(0x%016llx): ", (long long)status);
332 switch ((status >> 8) & 0xff) {
333 case 1:
334 p += sprintf(p, "Error detected internal to the component ");
335 break;
336 case 16:
337 p += sprintf(p, "Error detected in the bus ");
338 break;
339 case 4:
340 p += sprintf(p, "Storage error in DRAM memory ");
341 break;
342 case 5:
343 p += sprintf(p, "Storage error in TLB ");
344 break;
345 case 6:
346 p += sprintf(p, "Storage error in cache ");
347 break;
348 case 7:
349 p += sprintf(p, "Error in one or more functional units ");
350 break;
351 case 8:
352 p += sprintf(p, "component failed self test ");
353 break;
354 case 9:
355 p += sprintf(p, "Overflow or undervalue of internal queue ");
356 break;
357 case 17:
358 p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
359 break;
360 case 18:
361 p += sprintf(p, "Improper access error ");
362 break;
363 case 19:
364 p += sprintf(p, "Access to a memory address which is not mapped to any component ");
365 break;
366 case 20:
367 p += sprintf(p, "Loss of Lockstep ");
368 break;
369 case 21:
370 p += sprintf(p, "Response not associated with a request ");
371 break;
372 case 22:
373 p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
374 break;
375 case 23:
376 p += sprintf(p, "Detection of a PATH_ERROR ");
377 break;
378 case 25:
379 p += sprintf(p, "Bus operation timeout ");
380 break;
381 case 26:
382 p += sprintf(p, "A read was issued to data that has been poisoned ");
383 break;
384 default:
385 p += sprintf(p, "reserved ");
386 break;
387 }
388 }
389 if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
390 p += sprintf(p, "requestorID: 0x%016llx ",
391 (long long)mem_err->requestor_id);
392 if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
393 p += sprintf(p, "responderID: 0x%016llx ",
394 (long long)mem_err->responder_id);
395 if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
396 p += sprintf(p, "targetID: 0x%016llx ",
397 (long long)mem_err->responder_id);
398 if (p > pvt->other_detail)
399 *(p - 1) = '\0';
Mauro Carvalho Chehabf04c62a2013-02-15 06:36:27 -0300400
401 edac_raw_mc_handle_error(type, mci, e);
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300402}
403EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
404
405int ghes_edac_register(struct ghes *ghes, struct device *dev)
406{
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300407 bool fake = false;
408 int rc, num_dimm = 0;
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300409 struct mem_ctl_info *mci;
410 struct edac_mc_layer layers[1];
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300411 struct ghes_edac_pvt *pvt;
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300412 struct ghes_edac_dimm_fill dimm_fill;
413
414 /* Get the number of DIMMs */
415 dmi_walk(ghes_edac_count_dimms, &num_dimm);
416
417 /* Check if we've got a bogus BIOS */
418 if (num_dimm == 0) {
419 fake = true;
420 num_dimm = 1;
421 }
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300422
423 layers[0].type = EDAC_MC_LAYER_ALL_MEM;
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300424 layers[0].size = num_dimm;
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300425 layers[0].is_virt_csrow = true;
426
427 /*
428 * We need to serialize edac_mc_alloc() and edac_mc_add_mc(),
429 * to avoid duplicated memory controller numbers
430 */
431 mutex_lock(&ghes_edac_lock);
432 mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers,
433 sizeof(*pvt));
434 if (!mci) {
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -0300435 pr_info("Can't allocate memory for EDAC data\n");
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300436 mutex_unlock(&ghes_edac_lock);
437 return -ENOMEM;
438 }
439
440 pvt = mci->pvt_info;
441 memset(pvt, 0, sizeof(*pvt));
Mauro Carvalho Chehabf04c62a2013-02-15 06:36:27 -0300442 list_add_tail(&pvt->list, &ghes_reglist);
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300443 pvt->ghes = ghes;
444 pvt->mci = mci;
445 mci->pdev = dev;
446
447 mci->mtype_cap = MEM_FLAG_EMPTY;
448 mci->edac_ctl_cap = EDAC_FLAG_NONE;
449 mci->edac_cap = EDAC_FLAG_NONE;
450 mci->mod_name = "ghes_edac.c";
451 mci->mod_ver = GHES_EDAC_REVISION;
452 mci->ctl_name = "ghes_edac";
453 mci->dev_name = "ghes";
454
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -0300455 if (!ghes_edac_mc_num) {
456 if (!fake) {
457 pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
458 pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
459 pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
460 pr_info("If you find incorrect reports, please contact your hardware vendor\n");
461 pr_info("to correct its BIOS.\n");
462 pr_info("This system has %d DIMM sockets.\n",
463 num_dimm);
464 } else {
465 pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
466 pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
467 pr_info("work on such system. Use this driver with caution\n");
468 }
469 }
470
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300471 if (!fake) {
Mauro Carvalho Chehab5ee726d2013-02-15 08:45:00 -0300472 /*
473 * Fill DIMM info from DMI for the memory controller #0
474 *
475 * Keep it in blank for the other memory controllers, as
476 * there's no reliable way to properly credit each DIMM to
477 * the memory controller, as different BIOSes fill the
478 * DMI bank location fields on different ways
479 */
480 if (!ghes_edac_mc_num) {
481 dimm_fill.count = 0;
482 dimm_fill.mci = mci;
483 dmi_walk(ghes_edac_dmidecode, &dimm_fill);
484 }
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300485 } else {
486 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
487 mci->n_layers, 0, 0, 0);
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300488
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -0300489 dimm->nr_pages = 1;
Mauro Carvalho Chehab32fa1f52013-02-14 09:11:08 -0300490 dimm->grain = 128;
491 dimm->mtype = MEM_UNKNOWN;
492 dimm->dtype = DEV_UNKNOWN;
493 dimm->edac_mode = EDAC_SECDED;
494 }
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300495
496 rc = edac_mc_add_mc(mci);
497 if (rc < 0) {
Mauro Carvalho Chehabd2a68562013-02-15 09:06:38 -0300498 pr_info("Can't register at EDAC core\n");
Mauro Carvalho Chehab77c5f5d2013-02-15 06:11:57 -0300499 edac_mc_free(mci);
500 mutex_unlock(&ghes_edac_lock);
501 return -ENODEV;
502 }
503
504 ghes_edac_mc_num++;
505 mutex_unlock(&ghes_edac_lock);
506 return 0;
507}
508EXPORT_SYMBOL_GPL(ghes_edac_register);
509
510void ghes_edac_unregister(struct ghes *ghes)
511{
512 struct mem_ctl_info *mci;
513 struct ghes_edac_pvt *pvt;
514
515 list_for_each_entry(pvt, &ghes_reglist, list) {
516 if (ghes == pvt->ghes) {
517 mci = pvt->mci;
518 edac_mc_del_mc(mci->pdev);
519 edac_mc_free(mci);
520 list_del(&pvt->list);
521 }
522 }
523}
524EXPORT_SYMBOL_GPL(ghes_edac_unregister);