blob: ba317e2930a19c78741f93379d6d3d259821b99a [file] [log] [blame]
Dave Jiangc0d12172007-07-19 01:49:46 -07001/*
2 * Generic EDAC defs
3 *
4 * Author: Dave Jiang <djiang@mvista.com>
5 *
Hitoshi Mitakec3c52bc2008-04-29 01:03:18 -07006 * 2006-2008 (c) MontaVista Software, Inc. This file is licensed under
Dave Jiangc0d12172007-07-19 01:49:46 -07007 * the terms of the GNU General Public License version 2. This program
8 * is licensed "as is" without any warranty of any kind, whether express
9 * or implied.
10 *
11 */
12#ifndef _LINUX_EDAC_H_
13#define _LINUX_EDAC_H_
14
Arun Sharma600634972011-07-26 16:09:06 -070015#include <linux/atomic.h>
Paul Gortmaker313162d2012-01-30 11:46:54 -050016#include <linux/kobject.h>
17#include <linux/completion.h>
18#include <linux/workqueue.h>
19
/* Only pointers to struct device are used here; a forward declaration suffices. */
struct device;

/*
 * Values stored in edac_op_state: how the EDAC core is expected to learn
 * about errors.
 */
#define EDAC_OPSTATE_INVAL	-1
#define EDAC_OPSTATE_POLL	0
#define EDAC_OPSTATE_NMI	1
#define EDAC_OPSTATE_INT	2
26
27extern int edac_op_state;
Dave Jiang66ee2f92007-07-19 01:49:54 -070028extern int edac_err_assert;
Dave Jiangc0d12172007-07-19 01:49:46 -070029extern atomic_t edac_handlers;
Kay Sieversfe5ff8b2011-12-14 15:21:07 -080030extern struct bus_type edac_subsys;
Dave Jiangc0d12172007-07-19 01:49:46 -070031
32extern int edac_handler_set(void);
33extern void edac_atomic_assert_error(void);
Kay Sieversfe5ff8b2011-12-14 15:21:07 -080034extern struct bus_type *edac_get_sysfs_subsys(void);
35extern void edac_put_sysfs_subsys(void);
Dave Jiangc0d12172007-07-19 01:49:46 -070036
Hitoshi Mitakec3c52bc2008-04-29 01:03:18 -070037static inline void opstate_init(void)
38{
39 switch (edac_op_state) {
40 case EDAC_OPSTATE_POLL:
41 case EDAC_OPSTATE_NMI:
42 break;
43 default:
44 edac_op_state = EDAC_OPSTATE_POLL;
45 }
46 return;
47}
48
/*
 * Buffer sizing constants; the arrays that use them add one extra byte
 * (see channel_info.label and mem_ctl_info.proc_name).
 */
#define EDAC_MC_LABEL_LEN	31
#define MC_PROC_NAME_MAX_LEN	7
51
/*
 * Memory device types. Each enumerator is also used as a bit position by
 * the matching DEV_FLAG_* macro.
 */
enum dev_type {
	DEV_UNKNOWN	= 0,
	DEV_X1		= 1,
	DEV_X2		= 2,
	DEV_X4		= 3,
	DEV_X8		= 4,
	DEV_X16		= 5,
	DEV_X32		= 6,	/* Do these parts exist? */
	DEV_X64		= 7	/* Do these parts exist? */
};
63
/* Bit-mask form of enum dev_type: one bit per device type. */
#define DEV_FLAG_UNKNOWN	BIT(DEV_UNKNOWN)
#define DEV_FLAG_X1		BIT(DEV_X1)
#define DEV_FLAG_X2		BIT(DEV_X2)
#define DEV_FLAG_X4		BIT(DEV_X4)
#define DEV_FLAG_X8		BIT(DEV_X8)
#define DEV_FLAG_X16		BIT(DEV_X16)
#define DEV_FLAG_X32		BIT(DEV_X32)
#define DEV_FLAG_X64		BIT(DEV_X64)
72
/*
 * Memory types. Each enumerator is also used as a bit position by the
 * matching MEM_FLAG_* macro.
 */
enum mem_type {
	MEM_EMPTY	= 0,	/* Empty csrow */
	MEM_RESERVED	= 1,	/* Reserved csrow type */
	MEM_UNKNOWN	= 2,	/* Unknown csrow type */
	MEM_FPM		= 3,	/* Fast page mode */
	MEM_EDO		= 4,	/* Extended data out */
	MEM_BEDO	= 5,	/* Burst Extended data out */
	MEM_SDR		= 6,	/* Single data rate SDRAM */
	MEM_RDR		= 7,	/* Registered single data rate SDRAM */
	MEM_DDR		= 8,	/* Double data rate SDRAM */
	MEM_RDDR	= 9,	/* Registered Double data rate SDRAM */
	MEM_RMBS	= 10,	/* Rambus DRAM */
	MEM_DDR2	= 11,	/* DDR2 RAM */
	MEM_FB_DDR2	= 12,	/* fully buffered DDR2 */
	MEM_RDDR2	= 13,	/* Registered DDR2 RAM */
	MEM_XDR		= 14,	/* Rambus XDR */
	MEM_DDR3	= 15,	/* DDR3 RAM */
	MEM_RDDR3	= 16,	/* Registered DDR3 RAM */
};
93
/* Bit-mask form of enum mem_type: one bit per memory type. */
#define MEM_FLAG_EMPTY		BIT(MEM_EMPTY)
#define MEM_FLAG_RESERVED	BIT(MEM_RESERVED)
#define MEM_FLAG_UNKNOWN	BIT(MEM_UNKNOWN)
#define MEM_FLAG_FPM		BIT(MEM_FPM)
#define MEM_FLAG_EDO		BIT(MEM_EDO)
#define MEM_FLAG_BEDO		BIT(MEM_BEDO)
#define MEM_FLAG_SDR		BIT(MEM_SDR)
#define MEM_FLAG_RDR		BIT(MEM_RDR)
#define MEM_FLAG_DDR		BIT(MEM_DDR)
#define MEM_FLAG_RDDR		BIT(MEM_RDDR)
#define MEM_FLAG_RMBS		BIT(MEM_RMBS)
#define MEM_FLAG_DDR2		BIT(MEM_DDR2)
#define MEM_FLAG_FB_DDR2	BIT(MEM_FB_DDR2)
#define MEM_FLAG_RDDR2		BIT(MEM_RDDR2)
#define MEM_FLAG_XDR		BIT(MEM_XDR)
#define MEM_FLAG_DDR3		BIT(MEM_DDR3)
#define MEM_FLAG_RDDR3		BIT(MEM_RDDR3)
111
/*
 * Chipset Error Detection and Correction capabilities and mode. Each
 * enumerator that has an EDAC_FLAG_* macro is used by it as a bit position.
 */
enum edac_type {
	EDAC_UNKNOWN	= 0,	/* Unknown if ECC is available */
	EDAC_NONE	= 1,	/* Doesn't support ECC */
	EDAC_RESERVED	= 2,	/* Reserved ECC type */
	EDAC_PARITY	= 3,	/* Detects parity errors */
	EDAC_EC		= 4,	/* Error Checking - no correction */
	EDAC_SECDED	= 5,	/* Single bit error correction, Double detection */
	EDAC_S2ECD2ED	= 6,	/* Chipkill x2 devices - do these exist? */
	EDAC_S4ECD4ED	= 7,	/* Chipkill x4 devices */
	EDAC_S8ECD8ED	= 8,	/* Chipkill x8 devices */
	EDAC_S16ECD16ED	= 9,	/* Chipkill x16 devices */
};
125
/*
 * Bit-mask form of enum edac_type.
 * Note that EDAC_RESERVED has no corresponding flag.
 */
#define EDAC_FLAG_UNKNOWN	BIT(EDAC_UNKNOWN)
#define EDAC_FLAG_NONE		BIT(EDAC_NONE)
#define EDAC_FLAG_PARITY	BIT(EDAC_PARITY)
#define EDAC_FLAG_EC		BIT(EDAC_EC)
#define EDAC_FLAG_SECDED	BIT(EDAC_SECDED)
#define EDAC_FLAG_S2ECD2ED	BIT(EDAC_S2ECD2ED)
#define EDAC_FLAG_S4ECD4ED	BIT(EDAC_S4ECD4ED)
#define EDAC_FLAG_S8ECD8ED	BIT(EDAC_S8ECD8ED)
#define EDAC_FLAG_S16ECD16ED	BIT(EDAC_S16ECD16ED)
135
/*
 * Scrubbing capabilities. Each enumerator that has a SCRUB_FLAG_* macro is
 * used by it as a bit position.
 */
enum scrub_type {
	SCRUB_UNKNOWN		= 0,	/* Unknown if scrubber is available */
	SCRUB_NONE		= 1,	/* No scrubber */
	SCRUB_SW_PROG		= 2,	/* SW progressive (sequential) scrubbing */
	SCRUB_SW_SRC		= 3,	/* Software scrub only errors */
	SCRUB_SW_PROG_SRC	= 4,	/* Progressive software scrub from an error */
	SCRUB_SW_TUNABLE	= 5,	/* Software scrub frequency is tunable */
	SCRUB_HW_PROG		= 6,	/* HW progressive (sequential) scrubbing */
	SCRUB_HW_SRC		= 7,	/* Hardware scrub only errors */
	SCRUB_HW_PROG_SRC	= 8,	/* Progressive hardware scrub from an error */
	SCRUB_HW_TUNABLE	= 9	/* Hardware scrub frequency is tunable */
};
149
/*
 * Bit-mask form of enum scrub_type: one bit per scrubbing capability.
 * SCRUB_UNKNOWN and SCRUB_NONE have no corresponding flag.
 *
 * SCRUB_FLAG_SW_TUN must name SCRUB_SW_TUNABLE, the enumerator that
 * enum scrub_type actually defines; the former spelling
 * SCRUB_SW_SCRUB_TUNABLE does not exist, so any use of the macro failed
 * to compile.
 */
#define SCRUB_FLAG_SW_PROG	BIT(SCRUB_SW_PROG)
#define SCRUB_FLAG_SW_SRC	BIT(SCRUB_SW_SRC)
#define SCRUB_FLAG_SW_PROG_SRC	BIT(SCRUB_SW_PROG_SRC)
#define SCRUB_FLAG_SW_TUN	BIT(SCRUB_SW_TUNABLE)
#define SCRUB_FLAG_HW_PROG	BIT(SCRUB_HW_PROG)
#define SCRUB_FLAG_HW_SRC	BIT(SCRUB_HW_SRC)
#define SCRUB_FLAG_HW_PROG_SRC	BIT(SCRUB_HW_PROG_SRC)
#define SCRUB_FLAG_HW_TUN	BIT(SCRUB_HW_TUNABLE)
158
159/* FIXME - should have notify capabilities: NMI, LOG, PROC, etc */
160
/*
 * EDAC internal operation states, as stored in mem_ctl_info.op_state.
 */
#define OP_ALLOC		0x100
#define OP_RUNNING_POLL		0x201
#define OP_RUNNING_INTERRUPT	0x202
#define OP_RUNNING_POLL_INTR	0x203
#define OP_OFFLINE		0x300
167
168/*
169 * There are several things to be aware of that aren't at all obvious:
170 *
171 *
172 * SOCKETS, SOCKET SETS, BANKS, ROWS, CHIP-SELECT ROWS, CHANNELS, etc..
173 *
174 * These are some of the many terms that are thrown about that don't always
175 * mean what people think they mean (Inconceivable!). In the interest of
176 * creating a common ground for discussion, terms and their definitions
177 * will be established.
178 *
179 * Memory devices: The individual chip on a memory stick. These devices
180 * commonly output 4 and 8 bits each. Grouping several
181 * of these in parallel provides 64 bits which is common
182 * for a memory stick.
183 *
 * Memory Stick:	A printed circuit board that aggregates multiple
 *			memory devices in parallel. This is the atomic
 *			memory component that is purchasable by Joe consumer
 *			and loaded into a memory socket.
188 *
189 * Socket: A physical connector on the motherboard that accepts
190 * a single memory stick.
191 *
192 * Channel: Set of memory devices on a memory stick that must be
193 * grouped in parallel with one or more additional
194 * channels from other memory sticks. This parallel
 *			grouping of the output from multiple channels is
 *			necessary for the smallest granularity of memory access.
197 * Some memory controllers are capable of single channel -
198 * which means that memory sticks can be loaded
199 * individually. Other memory controllers are only
200 * capable of dual channel - which means that memory
201 * sticks must be loaded as pairs (see "socket set").
202 *
 * Chip-select row:	All of the memory devices that are selected together
 *			for a single, minimum grain of memory access.
205 * This selects all of the parallel memory devices across
206 * all of the parallel channels. Common chip-select rows
207 * for single channel are 64 bits, for dual channel 128
208 * bits.
209 *
 * Single-Ranked stick:	A Single-ranked stick has 1 chip-select row of memory.
 *			Motherboards commonly drive two chip-select pins to
 *			a memory stick. A single-ranked stick will occupy
 *			only one of those rows. The other will be unused.
214 *
215 * Double-Ranked stick: A double-ranked stick has two chip-select rows which
216 * access different sets of memory devices. The two
217 * rows cannot be accessed concurrently.
218 *
219 * Double-sided stick: DEPRECATED TERM, see Double-Ranked stick.
220 * A double-sided stick has two chip-select rows which
221 * access different sets of memory devices. The two
222 * rows cannot be accessed concurrently. "Double-sided"
223 * is irrespective of the memory devices being mounted
224 * on both sides of the memory stick.
225 *
226 * Socket set: All of the memory sticks that are required for
227 * a single memory access or all of the memory sticks
228 * spanned by a chip-select row. A single socket set
229 * has two chip-select rows and if double-sided sticks
230 * are used these will occupy those chip-select rows.
231 *
232 * Bank: This term is avoided because it is unclear when
233 * needing to distinguish between chip-select rows and
234 * socket sets.
235 *
236 * Controller pages:
237 *
238 * Physical pages:
239 *
240 * Virtual pages:
241 *
242 *
243 * STRUCTURE ORGANIZATION AND CHOICES
244 *
245 *
246 *
247 * PS - I enjoyed writing all that about as much as you enjoyed reading it.
248 */
249
250struct channel_info {
251 int chan_idx; /* channel index */
252 u32 ce_count; /* Correctable Errors for this CHANNEL */
253 char label[EDAC_MC_LABEL_LEN + 1]; /* DIMM label on motherboard */
254 struct csrow_info *csrow; /* the parent */
255};
256
257struct csrow_info {
258 unsigned long first_page; /* first page number in dimm */
259 unsigned long last_page; /* last page number in dimm */
260 unsigned long page_mask; /* used for interleaving -
261 * 0UL for non intlv
262 */
263 u32 nr_pages; /* number of pages in csrow */
264 u32 grain; /* granularity of reported error in bytes */
265 int csrow_idx; /* the chip-select row */
266 enum dev_type dtype; /* memory device type */
267 u32 ue_count; /* Uncorrectable Errors for this csrow */
268 u32 ce_count; /* Correctable Errors for this csrow */
269 enum mem_type mtype; /* memory csrow type */
270 enum edac_type edac_mode; /* EDAC mode for this csrow */
271 struct mem_ctl_info *mci; /* the parent */
272
273 struct kobject kobj; /* sysfs kobject for this csrow */
274
275 /* channel information for this csrow */
276 u32 nr_channels;
277 struct channel_info *channels;
278};
279
/* A named group of memory-controller sysfs attributes. */
struct mcidev_sysfs_group {
	/* group name */
	const char *name;
	/* group attributes */
	const struct mcidev_sysfs_attribute *mcidev_attr;
};
284
285struct mcidev_sysfs_group_kobj {
286 struct list_head list; /* list for all instances within a mc */
287
288 struct kobject kobj; /* kobj for the group */
289
290 const struct mcidev_sysfs_group *grp; /* group description table */
291 struct mem_ctl_info *mci; /* the parent */
292};
293
294/* mcidev_sysfs_attribute structure
295 * used for driver sysfs attributes and in mem_ctl_info
296 * sysfs top level entries
297 */
298struct mcidev_sysfs_attribute {
299 /* It should use either attr or grp */
300 struct attribute attr;
301 const struct mcidev_sysfs_group *grp; /* Points to a group of attributes */
302
303 /* Ops for show/store values at the attribute - not used on group */
304 ssize_t (*show)(struct mem_ctl_info *,char *);
305 ssize_t (*store)(struct mem_ctl_info *, const char *,size_t);
306};
307
308/* MEMORY controller information structure
309 */
310struct mem_ctl_info {
311 struct list_head link; /* for global list of mem_ctl_info structs */
312
313 struct module *owner; /* Module owner of this control struct */
314
315 unsigned long mtype_cap; /* memory types supported by mc */
316 unsigned long edac_ctl_cap; /* Mem controller EDAC capabilities */
317 unsigned long edac_cap; /* configuration capabilities - this is
318 * closely related to edac_ctl_cap. The
319 * difference is that the controller may be
320 * capable of s4ecd4ed which would be listed
321 * in edac_ctl_cap, but if channels aren't
322 * capable of s4ecd4ed then the edac_cap would
323 * not have that capability.
324 */
325 unsigned long scrub_cap; /* chipset scrub capabilities */
326 enum scrub_type scrub_mode; /* current scrub mode */
327
328 /* Translates sdram memory scrub rate given in bytes/sec to the
329 internal representation and configures whatever else needs
330 to be configured.
331 */
332 int (*set_sdram_scrub_rate) (struct mem_ctl_info * mci, u32 bw);
333
334 /* Get the current sdram memory scrub rate from the internal
335 representation and converts it to the closest matching
336 bandwidth in bytes/sec.
337 */
338 int (*get_sdram_scrub_rate) (struct mem_ctl_info * mci);
339
340
341 /* pointer to edac checking routine */
342 void (*edac_check) (struct mem_ctl_info * mci);
343
344 /*
345 * Remaps memory pages: controller pages to physical pages.
346 * For most MC's, this will be NULL.
347 */
348 /* FIXME - why not send the phys page to begin with? */
349 unsigned long (*ctl_page_to_phys) (struct mem_ctl_info * mci,
350 unsigned long page);
351 int mc_idx;
352 int nr_csrows;
353 struct csrow_info *csrows;
354 /*
355 * FIXME - what about controllers on other busses? - IDs must be
356 * unique. dev pointer should be sufficiently unique, but
357 * BUS:SLOT.FUNC numbers may not be unique.
358 */
359 struct device *dev;
360 const char *mod_name;
361 const char *mod_ver;
362 const char *ctl_name;
363 const char *dev_name;
364 char proc_name[MC_PROC_NAME_MAX_LEN + 1];
365 void *pvt_info;
366 u32 ue_noinfo_count; /* Uncorrectable Errors w/o info */
367 u32 ce_noinfo_count; /* Correctable Errors w/o info */
368 u32 ue_count; /* Total Uncorrectable Errors for this MC */
369 u32 ce_count; /* Total Correctable Errors for this MC */
370 unsigned long start_time; /* mci load start time (in jiffies) */
371
372 struct completion complete;
373
374 /* edac sysfs device control */
375 struct kobject edac_mci_kobj;
376
377 /* list for all grp instances within a mc */
378 struct list_head grp_kobj_list;
379
380 /* Additional top controller level attributes, but specified
381 * by the low level driver.
382 *
383 * Set by the low level driver to provide attributes at the
384 * controller level, same level as 'ue_count' and 'ce_count' above.
385 * An array of structures, NULL terminated
386 *
387 * If attributes are desired, then set to array of attributes
388 * If no attributes are desired, leave NULL
389 */
390 const struct mcidev_sysfs_attribute *mc_driver_sysfs_attributes;
391
392 /* work struct for this MC */
393 struct delayed_work work;
394
395 /* the internal state of this controller instance */
396 int op_state;
397};
398
Dave Jiangc0d12172007-07-19 01:49:46 -0700399#endif