blob: 53f056f7a44eb969e5fd0942ac6c6f3a89ff89e8 [file] [log] [blame]
Scott Andersonb0114cb2012-04-09 14:08:22 -07001// Copyright 2008 Google Inc. All Rights Reserved.
2
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6
7// http://www.apache.org/licenses/LICENSE-2.0
8
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
// error_diag.cc: Collects device errors for analysis, to more accurately
// pinpoint the failed component.
17
18#include <set>
19#include <list>
20#include <map>
21
// This file must also build with autoconf in its public version,
// so these includes are correct as written.
24#include "error_diag.h"
25#include "sattypes.h"
26
27
28// DeviceTree constructor.
29DeviceTree::DeviceTree(string name)
30 : parent_(0), name_(name) {
31 pthread_mutex_init(&device_tree_mutex_, NULL);
32}
33
34// DeviceTree destructor.
35DeviceTree::~DeviceTree() {
36 // Deallocate subtree devices.
37 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
38 itr != subdevices_.end();
39 ++itr) {
40 delete itr->second;
41 }
42 // Deallocate device errors.
43 for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
44 itr != errors_.end();
45 ++itr) {
46 delete (*itr);
47 }
48 pthread_mutex_destroy(&device_tree_mutex_);
49}
50
51// Atomically find named device in sub device tree.
52// Returns 0 if not found
53DeviceTree *DeviceTree::FindInSubTree(string name) {
54 DeviceTree *ret;
55 pthread_mutex_lock(&device_tree_mutex_);
56 ret = UnlockedFindInSubTree(name);
57 pthread_mutex_unlock(&device_tree_mutex_);
58 return ret;
59}
60
61// Find named device in sub device tree (Non-atomic).
62// Returns 0 if not found
63DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) {
64 std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name);
65 if (itr != subdevices_.end()) {
66 return itr->second;
67 } else {
68 // Search sub-tree.
69 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
70 itr != subdevices_.end();
71 ++itr) {
72 DeviceTree *result = itr->second->UnlockedFindInSubTree(name);
73 if (result != 0)
74 return result;
75 }
76 return 0;
77 }
78}
79
80// Atomically add error instance to device.
81void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) {
82 pthread_mutex_lock(&device_tree_mutex_);
83 errors_.push_back(error_instance);
84 pthread_mutex_unlock(&device_tree_mutex_);
85}
86
87// Find or add queried device as necessary.
88DeviceTree *DeviceTree::FindOrAddDevice(string name) {
89 // Assume named device does not exist and try to insert the device anyway.
90 // No-op if named device already exists.
91 InsertSubDevice(name);
92 // Find and return sub device pointer.
93 return FindInSubTree(name);
94}
95
96// Pretty prints device tree.
97void DeviceTree::PrettyPrint(string spacer) {
98 for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
99 itr != subdevices_.end();
100 ++itr) {
101 printf("%s%s\n", spacer.c_str(), itr->first.c_str());
102 itr->second->PrettyPrint(spacer+spacer);
103 }
104}
105
106// Atomically add sub device.
107// No-op if named device already exists.
108void DeviceTree::InsertSubDevice(string name) {
109 pthread_mutex_lock(&device_tree_mutex_);
110 if (UnlockedFindInSubTree(name) != 0) {
111 pthread_mutex_unlock(&device_tree_mutex_);
112 return;
113 }
114 subdevices_[name] = new DeviceTree(name);
115 subdevices_[name]->parent_ = this;
116 pthread_mutex_unlock(&device_tree_mutex_);
117}
118
119
120// Returns true of any error associated with this device is fatal.
121bool DeviceTree::KnownBad() {
122 pthread_mutex_lock(&device_tree_mutex_);
123 for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
124 itr != errors_.end();
125 ++itr) {
126 if ((*itr)->severity_ == SAT_ERROR_FATAL) {
127 pthread_mutex_unlock(&device_tree_mutex_);
128 return true;
129 }
130 }
131 pthread_mutex_unlock(&device_tree_mutex_);
132 return false;
133}
134
135
136// ErrorDiag constructor.
137ErrorDiag::ErrorDiag() {
138 os_ = 0;
139 system_tree_root_ = 0;
140}
141
142// ErrorDiag destructor.
143ErrorDiag::~ErrorDiag() {
144 if (system_tree_root_)
145 delete system_tree_root_;
146}
147
148// Set platform specific handle and initialize device tree.
149// Returns false on error. true otherwise.
150bool ErrorDiag::set_os(OsLayer *os) {
151 os_ = os;
152 return(InitializeDeviceTree());
153}
154
155// Create and initialize system device tree.
156// Returns false on error. true otherwise.
157bool ErrorDiag::InitializeDeviceTree() {
158 system_tree_root_ = new DeviceTree("system_root");
159 if (!system_tree_root_)
160 return false;
161 return true;
162}
163
164// Logs info about a CECC.
165// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
166int ErrorDiag::AddCeccError(string dimm_string) {
167 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
168 ECCErrorInstance *error = new ECCErrorInstance;
169 if (!error)
170 return -1;
171 error->severity_ = SAT_ERROR_CORRECTABLE;
172 dimm_device->AddErrorInstance(error);
173 return 0;
174}
175
176// Logs info about a UECC.
177// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
178int ErrorDiag::AddUeccError(string dimm_string) {
179 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
180 ECCErrorInstance *error = new ECCErrorInstance;
181 if (!error)
182 return -1;
183 error->severity_ = SAT_ERROR_FATAL;
184 dimm_device->AddErrorInstance(error);
185 return 0;
186}
187
188// Logs info about a miscompare.
189// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
190int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) {
191 DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
192 MiscompareErrorInstance *error = new MiscompareErrorInstance;
193 if (!error)
194 return -1;
195 error->severity_ = SAT_ERROR_FATAL;
196 error->addr_ = addr;
197 dimm_device->AddErrorInstance(error);
198 os_->ErrorReport(dimm_string.c_str(), "miscompare", count);
199 return 1;
200}
201
202// Utility Function to translate a virtual address to DIMM number.
203// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
204string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) {
205 char dimm_string[256] = "";
206 char *vbyteaddr = reinterpret_cast<char*>(addr) + offset;
207 uint64 paddr = os->VirtualToPhysical(vbyteaddr);
208 os->FindDimm(paddr, dimm_string, sizeof(dimm_string));
209 return string(dimm_string);
210}
211
212// Info about a miscompare from a drive.
213// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
214int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset,
215 void *src_addr, void *dst_addr) {
216 bool mask_hdd_error = false;
217
218 HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance;
219 if (!error)
220 return -1;
221
222 error->addr_ = reinterpret_cast<uint64>(src_addr);
223 error->addr2_ = reinterpret_cast<uint64>(dst_addr);
224 error->offset_ = offset;
225 error->block_ = block;
226
227 string src_dimm = AddressToDimmString(os_, src_addr, offset);
228 string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
229
230 // DIMM name look up success
231 if (src_dimm.compare("DIMM Unknown")) {
232 // Add src DIMM as possible miscompare cause.
233 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
234 error->causes_.insert(src_dimm_dev);
235 if (src_dimm_dev->KnownBad()) {
236 mask_hdd_error = true;
237 logprintf(5, "Log: supressed %s miscompare report: "
238 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
239 }
240 }
241 if (dst_dimm.compare("DIMM Unknown")) {
242 // Add dst DIMM as possible miscompare cause.
243 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
244 error->causes_.insert(dst_dimm_dev);
245 if (dst_dimm_dev->KnownBad()) {
246 mask_hdd_error = true;
247 logprintf(5, "Log: supressed %s miscompare report: "
248 "known bad destination: %s\n", devicename.c_str(),
249 dst_dimm.c_str());
250 }
251 }
252
253 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
254 hdd_dev->AddErrorInstance(error);
255
256 // HDD error was not masked by bad DIMMs: report bad HDD.
257 if (!mask_hdd_error) {
258 os_->ErrorReport(devicename.c_str(), "miscompare", 1);
259 error->severity_ = SAT_ERROR_FATAL;
260 return 1;
261 }
262 return 0;
263}
264
265// Info about a sector tag miscompare from a drive.
266// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
267int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset,
268 int sector, void *src_addr,
269 void *dst_addr) {
270 bool mask_hdd_error = false;
271
272 HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance;
273 if (!error)
274 return -1;
275
276 error->addr_ = reinterpret_cast<uint64>(src_addr);
277 error->addr2_ = reinterpret_cast<uint64>(dst_addr);
278 error->sector_ = sector;
279 error->block_ = block;
280
281 string src_dimm = AddressToDimmString(os_, src_addr, offset);
282 string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
283
284 // DIMM name look up success
285 if (src_dimm.compare("DIMM Unknown")) {
286 // Add src DIMM as possible miscompare cause.
287 DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
288 error->causes_.insert(src_dimm_dev);
289 if (src_dimm_dev->KnownBad()) {
290 mask_hdd_error = true;
291 logprintf(5, "Log: supressed %s sector tag error report: "
292 "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
293 }
294 }
295 if (dst_dimm.compare("DIMM Unknown")) {
296 // Add dst DIMM as possible miscompare cause.
297 DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
298 error->causes_.insert(dst_dimm_dev);
299 if (dst_dimm_dev->KnownBad()) {
300 mask_hdd_error = true;
301 logprintf(5, "Log: supressed %s sector tag error report: "
302 "known bad destination: %s\n", devicename.c_str(),
303 dst_dimm.c_str());
304 }
305 }
306
307 DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
308 hdd_dev->AddErrorInstance(error);
309
310 // HDD error was not masked by bad DIMMs: report bad HDD.
311 if (!mask_hdd_error) {
312 os_->ErrorReport(devicename.c_str(), "sector", 1);
313 error->severity_ = SAT_ERROR_FATAL;
314 return 1;
315 }
316 return 0;
317}