// Copyright 2006 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// worker.cc : individual tasks that can be run in combination to
// stress the system

#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <sys/select.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/times.h>

// These are necessary, but on by default
// #define __USE_GNU
// #define __USE_LARGEFILE64
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <linux/unistd.h>  // for gettid

// For size of block device
#include <sys/ioctl.h>
#include <linux/fs.h>
// For asynchronous I/O
#ifdef HAVE_LIBAIO_H
#include <libaio.h>
#endif

#include <sys/syscall.h>

#include <set>
#include <string>

// This file must work with autoconf on its public version,
// so these includes are correct.
#include "error_diag.h"  // NOLINT
#include "os.h"          // NOLINT
#include "pattern.h"     // NOLINT
#include "queue.h"       // NOLINT
#include "sat.h"         // NOLINT
#include "sattypes.h"    // NOLINT
#include "worker.h"      // NOLINT

// Syscalls
// Why ubuntu, do you hate gettid so bad?
#if !defined(__NR_gettid)
  #define __NR_gettid 224
#endif

#define gettid() syscall(__NR_gettid)
#if !defined(CPU_SETSIZE)
_syscall3(int, sched_getaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
_syscall3(int, sched_setaffinity, pid_t, pid,
          unsigned int, len, cpu_set_t*, mask)
#endif

namespace {
  // Get HW core ID from cpuid instruction.
  inline int apicid(void) {
    int cpu;
#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
    __asm __volatile("cpuid" : "=b" (cpu) : "a" (1) : "cx", "dx");
#elif defined(STRESSAPPTEST_CPU_ARMV7A)
  #warning "Unsupported CPU type ARMV7A: unable to determine core ID."
    cpu = 0;
#else
  #warning "Unsupported CPU type: unable to determine core ID."
    cpu = 0;
#endif
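    // CPUID leaf 1 returns the core's initial APIC ID in bits 31:24 of EBX.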
    return (cpu >> 24);
  }

  // Work around the sad fact that there are two (gnu, xsi) incompatible
  // versions of strerror_r floating around google. Awesome.
  bool sat_strerror(int err, char *buf, int len) {
    buf[0] = 0;
    char *errmsg = reinterpret_cast<char*>(strerror_r(err, buf, len));
    int retval = reinterpret_cast<int64>(errmsg);
    if (retval == 0)
      return true;
    if (retval == -1)
      return false;
    if (errmsg != buf) {
      strncpy(buf, errmsg, len);
      buf[len - 1] = 0;
    }
    return true;
  }

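  // In tag mode, the first 64-bit word of each 64-byte cache line is
  // overwritten with its own virtual address (see FillPage and TagAddrC),
  // so a stale or misdirected cache line can be traced back to the
  // location it was meant for.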
  inline uint64 addr_to_tag(void *address) {
    return reinterpret_cast<uint64>(address);
  }
}  // namespace

#if !defined(O_DIRECT)
// Sometimes this isn't available.
// Disregard if it's not defined.
  #define O_DIRECT 0
#endif

// A struct to hold captured errors, for later reporting.
struct ErrorRecord {
  uint64 actual;  // This is the actual value read.
  uint64 reread;  // This is the actual value, reread.
  uint64 expected;  // This is what it should have been.
  uint64 *vaddr;  // This is where it was (or wasn't).
  char *vbyteaddr;  // This is byte specific where the data was (or wasn't).
  uint64 paddr;  // This is the bus address, if available.
  uint64 *tagvaddr;  // This holds the tag value if this data was tagged.
  uint64 tagpaddr;  // This holds the physical address corresponding to the tag.
};

// This is a helper function to create new threads with pthreads.
static void *ThreadSpawnerGeneric(void *ptr) {
  WorkerThread *worker = static_cast<WorkerThread*>(ptr);
  worker->StartRoutine();
  return NULL;
}

void WorkerStatus::Initialize() {
  sat_assert(0 == pthread_mutex_init(&num_workers_mutex_, NULL));
  sat_assert(0 == pthread_rwlock_init(&status_rwlock_, NULL));
#ifdef _POSIX_BARRIERS
  sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL,
                                       num_workers_ + 1));
#endif
}

void WorkerStatus::Destroy() {
  sat_assert(0 == pthread_mutex_destroy(&num_workers_mutex_));
  sat_assert(0 == pthread_rwlock_destroy(&status_rwlock_));
#ifdef _POSIX_BARRIERS
  sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
#endif
}

void WorkerStatus::PauseWorkers() {
  if (SetStatus(PAUSE) != PAUSE)
    WaitOnPauseBarrier();
}

void WorkerStatus::ResumeWorkers() {
  if (SetStatus(RUN) == PAUSE)
    WaitOnPauseBarrier();
}

void WorkerStatus::StopWorkers() {
  if (SetStatus(STOP) == PAUSE)
    WaitOnPauseBarrier();
}

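// Pause protocol: pause_barrier_ is sized num_workers_ + 1 and is crossed
// twice. The first crossing lets PauseWorkers() return once every worker
// has observed the PAUSE status; the second holds the workers until
// ResumeWorkers() joins them.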
bool WorkerStatus::ContinueRunning() {
  // This loop is an optimization. We use it to immediately re-check the status
  // after resuming from a pause, instead of returning and waiting for the next
  // call to this function.
  for (;;) {
    switch (GetStatus()) {
      case RUN:
        return true;
      case PAUSE:
        // Wait for the other workers to call this function so that
        // PauseWorkers() can return.
        WaitOnPauseBarrier();
        // Wait for ResumeWorkers() to be called.
        WaitOnPauseBarrier();
        break;
      case STOP:
        return false;
    }
  }
}

bool WorkerStatus::ContinueRunningNoPause() {
  return (GetStatus() != STOP);
}

void WorkerStatus::RemoveSelf() {
  // Acquire a read lock on status_rwlock_ while (status_ != PAUSE).
  for (;;) {
    AcquireStatusReadLock();
    if (status_ != PAUSE)
      break;
    // We need to obey PauseWorkers() just like ContinueRunning() would, so
    // that the other threads won't wait on pause_barrier_ forever.
    ReleaseStatusLock();
    // Wait for the other workers to call this function so that PauseWorkers()
    // can return.
    WaitOnPauseBarrier();
    // Wait for ResumeWorkers() to be called.
    WaitOnPauseBarrier();
  }

  // This lock would be unnecessary if we held a write lock instead of a read
  // lock on status_rwlock_, but that would also force all threads calling
  // ContinueRunning() to wait on this one. Using a separate lock avoids that.
  AcquireNumWorkersLock();
  // Decrement num_workers_ and reinitialize pause_barrier_, which we know
  // isn't in use because (status != PAUSE).
#ifdef _POSIX_BARRIERS
  sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
  sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL, num_workers_));
#endif
  --num_workers_;
  ReleaseNumWorkersLock();

  // Release status_rwlock_.
  ReleaseStatusLock();
}


// Parent thread class.
WorkerThread::WorkerThread() {
  status_ = false;
  pages_copied_ = 0;
  errorcount_ = 0;
  runduration_usec_ = 1;
  priority_ = Normal;
  worker_status_ = NULL;
  thread_spawner_ = &ThreadSpawnerGeneric;
  tag_mode_ = false;
}

WorkerThread::~WorkerThread() {}

// Constructors. Just init some default values.
FillThread::FillThread() {
  num_pages_to_fill_ = 0;
}

// Initialize file name to empty.
FileThread::FileThread() {
  filename_ = "";
  devicename_ = "";
  pass_ = 0;
  page_io_ = true;
  crc_page_ = -1;
  local_page_ = NULL;
}

// If the file thread used a bounce buffer in memory, account for the extra
// copy in the memory bandwidth calculation.
float FileThread::GetMemoryCopiedData() {
  if (!os_->normal_mem())
    return GetCopiedData();
  else
    return 0;
}

// Initialize target hostname to be invalid.
NetworkThread::NetworkThread() {
  snprintf(ipaddr_, sizeof(ipaddr_), "Unknown");
  sock_ = 0;
}

// Nothing additional to initialize.
NetworkSlaveThread::NetworkSlaveThread() {
}

// Nothing additional to initialize.
NetworkListenThread::NetworkListenThread() {
}

// Init member variables.
void WorkerThread::InitThread(int thread_num_init,
                              class Sat *sat_init,
                              class OsLayer *os_init,
                              class PatternList *patternlist_init,
                              WorkerStatus *worker_status) {
  sat_assert(worker_status);
  worker_status->AddWorkers(1);

  thread_num_ = thread_num_init;
  sat_ = sat_init;
  os_ = os_init;
  patternlist_ = patternlist_init;
  worker_status_ = worker_status;

  AvailableCpus(&cpu_mask_);
  tag_ = 0xffffffff;

  tag_mode_ = sat_->tag_mode();
}


// Use pthreads to prioritize a system thread.
bool WorkerThread::InitPriority() {
  // This doesn't affect performance that much, and may not be too safe.

  bool ret = BindToCpus(&cpu_mask_);
  if (!ret)
    logprintf(11, "Log: Bind to %s failed.\n",
              cpuset_format(&cpu_mask_).c_str());

  logprintf(11, "Log: Thread %d running on apic ID %d mask %s (%s).\n",
            thread_num_, apicid(),
            CurrentCpusFormat().c_str(),
            cpuset_format(&cpu_mask_).c_str());
#if 0
  if (priority_ == High) {
    sched_param param;
    param.sched_priority = 1;
    // Set the priority; others are unchanged.
    logprintf(0, "Log: Changing priority to SCHED_FIFO %d\n",
              param.sched_priority);
    if (sched_setscheduler(0, SCHED_FIFO, &param)) {
      char buf[256];
      sat_strerror(errno, buf, sizeof(buf));
      logprintf(0, "Process Error: sched_setscheduler "
                   "failed - error %d %s\n",
                errno, buf);
    }
  }
#endif
  return true;
}

// Use pthreads to create a system thread.
int WorkerThread::SpawnThread() {
  // Create the new thread.
  int result = pthread_create(&thread_, NULL, thread_spawner_, this);
  if (result) {
    char buf[256];
    sat_strerror(result, buf, sizeof(buf));
    logprintf(0, "Process Error: pthread_create "
                 "failed - error %d %s\n", result,
              buf);
    status_ = false;
    return false;
  }

  // 0 is pthreads success.
  return true;
}

// Kill the worker thread with SIGINT.
bool WorkerThread::KillThread() {
  return (pthread_kill(thread_, SIGINT) == 0);
}

// Block until thread has exited.
bool WorkerThread::JoinThread() {
  int result = pthread_join(thread_, NULL);

  if (result) {
    logprintf(0, "Process Error: pthread_join failed - error %d\n", result);
    status_ = false;
  }

  // 0 is pthreads success.
  return (!result);
}


void WorkerThread::StartRoutine() {
  InitPriority();
  StartThreadTimer();
  Work();
  StopThreadTimer();
  worker_status_->RemoveSelf();
}


// Thread work loop. Execute until marked finished.
bool WorkerThread::Work() {
  do {
    logprintf(9, "Log: ...\n");
    // Sleep for 1 second.
    sat_sleep(1);
  } while (IsReadyToRun());

  return false;
}


// Returns CPU mask of CPUs available to this process.
// Conceptually, each bit represents a logical CPU, i.e.:
//   mask = 3 (11b): cpu0, 1
//   mask = 13 (1101b): cpu0, 2, 3
bool WorkerThread::AvailableCpus(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
#ifdef HAVE_SCHED_GETAFFINITY
  return sched_getaffinity(getppid(), sizeof(*cpuset), cpuset) == 0;
#else
  return false;
#endif
}


// Returns CPU mask of CPUs this thread is bound to.
// Conceptually, each bit represents a logical CPU, i.e.:
//   mask = 3 (11b): cpu0, 1
//   mask = 13 (1101b): cpu0, 2, 3
bool WorkerThread::CurrentCpus(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
#ifdef HAVE_SCHED_GETAFFINITY
  return sched_getaffinity(0, sizeof(*cpuset), cpuset) == 0;
#else
  return false;
#endif
}


// Bind worker thread to specified CPU(s).
// Args:
//   thread_mask: cpu_set_t representing CPUs, i.e.:
//     mask = 1 (01b): cpu0
//     mask = 3 (11b): cpu0, 1
//     mask = 13 (1101b): cpu0, 2, 3
//
// Returns true on success, false otherwise.
bool WorkerThread::BindToCpus(const cpu_set_t *thread_mask) {
  cpu_set_t process_mask;
  AvailableCpus(&process_mask);
  if (cpuset_isequal(thread_mask, &process_mask))
    return true;

  logprintf(11, "Log: available CPU mask - %s\n",
            cpuset_format(&process_mask).c_str());
  if (!cpuset_issubset(thread_mask, &process_mask)) {
    // Invalid cpu_mask, i.e. a CPU not allocated to this process or one that
    // doesn't exist.
    logprintf(0, "Log: requested CPUs %s not a subset of available %s\n",
              cpuset_format(thread_mask).c_str(),
              cpuset_format(&process_mask).c_str());
    return false;
  }
#ifdef HAVE_SCHED_GETAFFINITY
  return (sched_setaffinity(gettid(), sizeof(*thread_mask), thread_mask) == 0);
#else
  return false;
#endif
}


// A worker thread can yield itself to give up CPU until it's scheduled again.
// Returns true on success, false on error.
bool WorkerThread::YieldSelf() {
  return (sched_yield() == 0);
}


// Fill this page with its pattern.
bool WorkerThread::FillPage(struct page_entry *pe) {
  // Error check arguments.
  if (pe == 0) {
    logprintf(0, "Process Error: Fill Page entry null\n");
    return 0;
  }

  // Mask is the bitmask of indexes used by the pattern.
  // It is the pattern size -1. Size is always a power of 2.
  uint64 *memwords = static_cast<uint64*>(pe->addr);
  int length = sat_->page_length();

  if (tag_mode_) {
    // Select tag or data as appropriate.
    for (int i = 0; i < length / wordsize_; i++) {
      datacast_t data;

      if ((i & 0x7) == 0) {
        data.l64 = addr_to_tag(&memwords[i]);
      } else {
        data.l32.l = pe->pattern->pattern(i << 1);
        data.l32.h = pe->pattern->pattern((i << 1) + 1);
      }
      memwords[i] = data.l64;
    }
  } else {
    // Just fill in untagged data directly.
    for (int i = 0; i < length / wordsize_; i++) {
      datacast_t data;

      data.l32.l = pe->pattern->pattern(i << 1);
      data.l32.h = pe->pattern->pattern((i << 1) + 1);
      memwords[i] = data.l64;
    }
  }

  return 1;
}


// Tell the thread how many pages to fill.
void FillThread::SetFillPages(int64 num_pages_to_fill_init) {
  num_pages_to_fill_ = num_pages_to_fill_init;
}

// Fill this page with a random pattern.
bool FillThread::FillPageRandom(struct page_entry *pe) {
  // Error check arguments.
  if (pe == 0) {
    logprintf(0, "Process Error: Fill Page entry null\n");
    return 0;
  }
  if ((patternlist_ == 0) || (patternlist_->Size() == 0)) {
    logprintf(0, "Process Error: No data patterns available\n");
    return 0;
  }

  // Choose a random pattern for this block.
  pe->pattern = patternlist_->GetRandomPattern();
  if (pe->pattern == 0) {
    logprintf(0, "Process Error: Null data pattern\n");
    return 0;
  }

  // Actually fill the page.
  return FillPage(pe);
}


// Memory fill work loop. Execute until the allotted pages are filled.
bool FillThread::Work() {
  bool result = true;

  logprintf(9, "Log: Starting fill thread %d\n", thread_num_);

  // We want to fill num_pages_to_fill pages, and
  // stop when we've filled that many.
  // We also want to detect an early exit request.
  struct page_entry pe;
  int64 loops = 0;
  while (IsReadyToRun() && (loops < num_pages_to_fill_)) {
    result = result && sat_->GetEmpty(&pe);
    if (!result) {
      logprintf(0, "Process Error: fill_thread failed to pop pages, "
                   "bailing\n");
      break;
    }

    // Fill the page with pattern
    result = result && FillPageRandom(&pe);
    if (!result) break;

    // Put the page back on the queue.
    result = result && sat_->PutValid(&pe);
    if (!result) {
      logprintf(0, "Process Error: fill_thread failed to push pages, "
                   "bailing\n");
      break;
    }
    loops++;
  }

  // Fill in thread status.
  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Fill thread. Status %d, %d pages filled\n",
            thread_num_, status_, pages_copied_);
  return result;
}


// Print error information about a data miscompare.
void WorkerThread::ProcessError(struct ErrorRecord *error,
                                int priority,
                                const char *message) {
  char dimm_string[256] = "";

  int apic_id = apicid();

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  char *good = reinterpret_cast<char*>(&(error->expected));
  char *bad = reinterpret_cast<char*>(&(error->actual));

  sat_assert(error->expected != error->actual);
  unsigned int offset = 0;
  for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
    if (good[offset] != bad[offset])
      break;
  }

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));

  // Report parseable error.
  if (priority < 5) {
    // Run miscompare error through diagnoser for logging and reporting.
    os_->error_diagnoser_->AddMiscompareError(dimm_string,
                                              reinterpret_cast<uint64>
                                              (error->vaddr), 1);

    logprintf(priority,
              "%s: miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
              "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
              message,
              apic_id,
              CurrentCpusFormat().c_str(),
              error->vaddr,
              error->paddr,
              dimm_string,
              error->actual,
              error->reread,
              error->expected);
  }

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}


// Print error information about a data miscompare.
void FileThread::ProcessError(struct ErrorRecord *error,
                              int priority,
                              const char *message) {
  char dimm_string[256] = "";

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  char *good = reinterpret_cast<char*>(&(error->expected));
  char *bad = reinterpret_cast<char*>(&(error->actual));

  sat_assert(error->expected != error->actual);
  unsigned int offset = 0;
  for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
    if (good[offset] != bad[offset])
      break;
  }

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));

  // If crc_page_ is valid, ie checking content read back from file,
  // track src/dst memory addresses. Otherwise categorize as a general
  // memory miscompare for CRC checking everywhere else.
  if (crc_page_ != -1) {
    int miscompare_byteoffset = static_cast<char*>(error->vbyteaddr) -
                                static_cast<char*>(page_recs_[crc_page_].dst);
    os_->error_diagnoser_->AddHDDMiscompareError(devicename_,
                                                 crc_page_,
                                                 miscompare_byteoffset,
                                                 page_recs_[crc_page_].src,
                                                 page_recs_[crc_page_].dst);
  } else {
    os_->error_diagnoser_->AddMiscompareError(dimm_string,
                                              reinterpret_cast<uint64>
                                              (error->vaddr), 1);
  }

  logprintf(priority,
            "%s: miscompare on %s at %p(0x%llx:%s): read:0x%016llx, "
            "reread:0x%016llx expected:0x%016llx\n",
            message,
            devicename_.c_str(),
            error->vaddr,
            error->paddr,
            dimm_string,
            error->actual,
            error->reread,
            error->expected);

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}


// Do a word by word result check of a region.
// Print errors on mismatches.
int WorkerThread::CheckRegion(void *addr,
                              class Pattern *pattern,
                              int64 length,
                              int offset,
                              int64 pattern_offset) {
  uint64 *memblock = static_cast<uint64*>(addr);
  const int kErrorLimit = 128;
  int errors = 0;
  int overflowerrors = 0;  // Count of overflowed errors.
  bool page_error = false;
  string errormessage("Hardware Error");
  struct ErrorRecord
    recorded[kErrorLimit];  // Queued errors for later printing.

  // For each word in the data region.
  for (int i = 0; i < length / wordsize_; i++) {
    uint64 actual = memblock[i];
    uint64 expected;

    // Determine the value that should be there.
    datacast_t data;
    int index = 2 * i + pattern_offset;
    data.l32.l = pattern->pattern(index);
    data.l32.h = pattern->pattern(index + 1);
    expected = data.l64;
    // Check tags if necessary.
    if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
      expected = addr_to_tag(&memblock[i]);
    }

    // If the value is incorrect, save an error record for later printing.
    if (actual != expected) {
      if (errors < kErrorLimit) {
        recorded[errors].actual = actual;
        recorded[errors].expected = expected;
        recorded[errors].vaddr = &memblock[i];
        errors++;
      } else {
        page_error = true;
        // If we have overflowed the error queue, just print the errors now.
        logprintf(10, "Log: Error record overflow, too many miscompares!\n");
        errormessage = "Page Error";
        break;
      }
    }
  }

  // Find if this is a whole block corruption.
  if (page_error && !tag_mode_) {
    int patsize = patternlist_->Size();
    for (int pat = 0; pat < patsize; pat++) {
      class Pattern *altpattern = patternlist_->GetPattern(pat);
      const int kGood = 0;
      const int kBad = 1;
      const int kGoodAgain = 2;
      const int kNoMatch = 3;
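      // Walk the block as a small state machine: a run of words matching
      // altpattern (kBad) bracketed by words matching the expected pattern
      // (kGood / kGoodAgain) indicates a contiguous chunk was written with
      // the wrong pattern; a word matching neither (kNoMatch) rules this
      // altpattern out.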
      int state = kGood;
      unsigned int badstart = 0;
      unsigned int badend = 0;

      // Don't match against ourself!
      if (pattern == altpattern)
        continue;

      for (int i = 0; i < length / wordsize_; i++) {
        uint64 actual = memblock[i];
        datacast_t expected;
        datacast_t possible;

        // Determine the value that should be there.
        int index = 2 * i + pattern_offset;

        expected.l32.l = pattern->pattern(index);
        expected.l32.h = pattern->pattern(index + 1);

        possible.l32.l = altpattern->pattern(index);
        possible.l32.h = altpattern->pattern(index + 1);

        if (state == kGood) {
          if (actual == expected.l64) {
            continue;
          } else if (actual == possible.l64) {
            badstart = i;
            badend = i;
            state = kBad;
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        } else if (state == kBad) {
          if (actual == possible.l64) {
            badend = i;
            continue;
          } else if (actual == expected.l64) {
            state = kGoodAgain;
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        } else if (state == kGoodAgain) {
          if (actual == expected.l64) {
            continue;
          } else {
            state = kNoMatch;
            break;
          }
        }
      }

      if ((state == kGoodAgain) || (state == kBad)) {
        unsigned int blockerrors = badend - badstart + 1;
        errormessage = "Block Error";
        ProcessError(&recorded[0], 0, errormessage.c_str());
        logprintf(0, "Block Error: (%p) pattern %s instead of %s, "
                     "%d bytes from offset 0x%x to 0x%x\n",
                  &memblock[badstart],
                  altpattern->name(), pattern->name(),
                  blockerrors * wordsize_,
                  offset + badstart * wordsize_,
                  offset + badend * wordsize_);
        errorcount_ += blockerrors;
        return blockerrors;
      }
    }
  }


  // Process error queue after all errors have been recorded.
  for (int err = 0; err < errors; err++) {
    int priority = 5;
    if (errorcount_ + err < 30)
      priority = 0;  // Bump up the priority for the first few errors.
    ProcessError(&recorded[err], priority, errormessage.c_str());
  }

  if (page_error) {
    // For each word in the data region.
    int error_recount = 0;
    for (int i = 0; i < length / wordsize_; i++) {
      uint64 actual = memblock[i];
      uint64 expected;
      datacast_t data;
      // Determine the value that should be there.
      int index = 2 * i + pattern_offset;

      data.l32.l = pattern->pattern(index);
      data.l32.h = pattern->pattern(index + 1);
      expected = data.l64;

      // Check tags if necessary.
      if (tag_mode_ && ((reinterpret_cast<uint64>(&memblock[i]) & 0x3f) == 0)) {
        expected = addr_to_tag(&memblock[i]);
      }

      // If the value is incorrect, save an error record for later printing.
      if (actual != expected) {
        if (error_recount < kErrorLimit) {
          // We already reported these.
          error_recount++;
        } else {
          // If we have overflowed the error queue, print the errors now.
          struct ErrorRecord er;
          er.actual = actual;
          er.expected = expected;
          er.vaddr = &memblock[i];

          // Do the error printout. This will take a long time and
          // likely change the machine state.
          ProcessError(&er, 12, errormessage.c_str());
          overflowerrors++;
        }
      }
    }
  }

  // Keep track of observed errors.
  errorcount_ += errors + overflowerrors;
  return errors + overflowerrors;
}

float WorkerThread::GetCopiedData() {
  return pages_copied_ * sat_->page_length() / kMegabyte;
}

// Calculate the CRC of a region.
// Fall back to a slow word-by-word check if the CRC mismatches.
int WorkerThread::CrcCheckPage(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int errors = 0;

  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();
  uint64 *memblock = static_cast<uint64*>(srcpe->addr);
  int blocks = sat_->page_length() / blocksize;
  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *memslice = memblock + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrCrcC(memslice, blocksize, &crc, srcpe);
    } else {
      CalculateAdlerChecksum(memslice, blocksize, &crc);
    }

    // If the CRC does not match, we'd better look closer.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcCheckPage Falling through to slow compare, "
                    "CRC mismatch %s != %s\n",
                crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(memslice,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcCheckPage CRC mismatch %s != %s, "
                     "but no miscompares found.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
      }
      errors += errorcount;
    }
  }

  // Check leftovers for odd-length transfers; we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *memslice = memblock + blocks * blockwords;
    errors += CheckRegion(memslice,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
  }
  return errors;
}


// Print error information about a tag miscompare.
void WorkerThread::ProcessTagError(struct ErrorRecord *error,
                                   int priority,
                                   const char *message) {
  char dimm_string[256] = "";
  char tag_dimm_string[256] = "";
  bool read_error = false;

  int apic_id = apicid();

  // Determine if this is a write or read error.
  os_->Flush(error->vaddr);
  error->reread = *(error->vaddr);

  // Distinguish read and write errors.
  if (error->actual != error->reread) {
    read_error = true;
  }

  sat_assert(error->expected != error->actual);

  error->vbyteaddr = reinterpret_cast<char*>(error->vaddr);

  // Find physical address if possible.
  error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
  error->tagpaddr = os_->VirtualToPhysical(error->tagvaddr);

  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->paddr, dimm_string, sizeof(dimm_string));
  // Pretty print DIMM mapping if available.
  os_->FindDimm(error->tagpaddr, tag_dimm_string, sizeof(tag_dimm_string));

  // Report parseable error.
  if (priority < 5) {
    logprintf(priority,
              "%s: Tag from %p(0x%llx:%s) (%s) "
              "miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
              "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
              message,
              error->tagvaddr, error->tagpaddr,
              tag_dimm_string,
              read_error ? "read error" : "write error",
              apic_id,
              CurrentCpusFormat().c_str(),
              error->vaddr,
              error->paddr,
              dimm_string,
              error->actual,
              error->reread,
              error->expected);
  }

  errorcount_ += 1;

  // Overwrite incorrect data with correct data to prevent
  // future miscompares when this data is reused.
  *(error->vaddr) = error->expected;
  os_->Flush(error->vaddr);
}


// Print out and log a tag error.
bool WorkerThread::ReportTagError(
    uint64 *mem64,
    uint64 actual,
    uint64 tag) {
  struct ErrorRecord er;
  er.actual = actual;

  er.expected = tag;
  er.vaddr = mem64;

  // Generate vaddr from tag.
  er.tagvaddr = reinterpret_cast<uint64*>(actual);

  ProcessTagError(&er, 0, "Hardware Error");
  return true;
}

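// The Adler routines below compute a four-term variant of the Adler
// checksum: two accumulator pairs, (a1, b1) and (a2, b2), are fed
// alternating 64-bit words so the loop can be software-pipelined; the four
// values together form the AdlerChecksum compared against each pattern's
// precomputed value.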
// C implementation of Adler memory copy, with memory tagging.
bool WorkerThread::AdlerAddrMemcpyC(uint64 *dstmem64,
                                    uint64 *srcmem64,
                                    unsigned int size_in_bytes,
                                    AdlerChecksum *checksum,
                                    struct page_entry *pe) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  datacast_t dstdata;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large: must be at most 2^19 64-bit words (4 MB).
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  class Pattern *pattern = pe->pattern;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    if ((i & 0x7) == 0) {
      data.l64 = srcmem64[i];
      dstdata.l64 = dstmem64[i];
      uint64 src_tag = addr_to_tag(&srcmem64[i]);
      uint64 dst_tag = addr_to_tag(&dstmem64[i]);
      // Detect if tags have been corrupted.
      if (data.l64 != src_tag)
        ReportTagError(&srcmem64[i], data.l64, src_tag);
      if (dstdata.l64 != dst_tag)
        ReportTagError(&dstmem64[i], dstdata.l64, dst_tag);

      // Feed the expected pattern words into the checksum in place of the
      // tag, then write the destination's own tag into the copy.
      data.l32.l = pattern->pattern(i << 1);
      data.l32.h = pattern->pattern((i << 1) + 1);
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;

      data.l64 = dst_tag;
      dstmem64[i] = data.l64;
    } else {
      data.l64 = srcmem64[i];
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
      dstmem64[i] = data.l64;
    }
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    dstmem64[i] = data.l64;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// x86_64 SSE2 assembly implementation of Adler memory copy, with address
// tagging added as a second step. This is useful for debugging failures
// that only occur when SSE / nontemporal writes are used.
bool WorkerThread::AdlerAddrMemcpyWarm(uint64 *dstmem64,
                                       uint64 *srcmem64,
                                       unsigned int size_in_bytes,
                                       AdlerChecksum *checksum,
                                       struct page_entry *pe) {
  // Do ASM copy, ignore checksum.
  AdlerChecksum ignored_checksum;
  os_->AdlerMemcpyWarm(dstmem64, srcmem64, size_in_bytes, &ignored_checksum);

  // Force cache flush. Stepping by eight 64-bit words touches each 64-byte
  // cache line once.
  int length = size_in_bytes / sizeof(*dstmem64);
  for (int i = 0; i < length; i += sizeof(*dstmem64)) {
    os_->FastFlush(dstmem64 + i);
    os_->FastFlush(srcmem64 + i);
  }
  // Check results.
  AdlerAddrCrcC(srcmem64, size_in_bytes, checksum, pe);
  // Patch up address tags.
  TagAddrC(dstmem64, size_in_bytes);
  return true;
}

// Retag pages.
bool WorkerThread::TagAddrC(uint64 *memwords,
                            unsigned int size_in_bytes) {
  // Rewrite the address tag in the first word of each 64-byte cache line.
  int length = size_in_bytes / wordsize_;
  for (int i = 0; i < length; i += 8) {
    datacast_t data;
    data.l64 = addr_to_tag(&memwords[i]);
    memwords[i] = data.l64;
  }
  return true;
}

// C implementation of Adler memory crc.
bool WorkerThread::AdlerAddrCrcC(uint64 *srcmem64,
                                 unsigned int size_in_bytes,
                                 AdlerChecksum *checksum,
                                 struct page_entry *pe) {
  // Use this data wrapper to access memory with 64bit read/write.
  datacast_t data;
  unsigned int count = size_in_bytes / sizeof(data);

  if (count > ((1U) << 19)) {
    // Size is too large: must be at most 2^19 64-bit words (4 MB).
    return false;
  }

  uint64 a1 = 1;
  uint64 a2 = 1;
  uint64 b1 = 0;
  uint64 b2 = 0;

  class Pattern *pattern = pe->pattern;

  unsigned int i = 0;
  while (i < count) {
    // Process 64 bits at a time.
    if ((i & 0x7) == 0) {
      data.l64 = srcmem64[i];
      uint64 src_tag = addr_to_tag(&srcmem64[i]);
      // Check that tags match expected.
      if (data.l64 != src_tag)
        ReportTagError(&srcmem64[i], data.l64, src_tag);

      // Feed the expected pattern words into the checksum in place of the tag.
      data.l32.l = pattern->pattern(i << 1);
      data.l32.h = pattern->pattern((i << 1) + 1);
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
    } else {
      data.l64 = srcmem64[i];
      a1 = a1 + data.l32.l;
      b1 = b1 + a1;
      a1 = a1 + data.l32.h;
      b1 = b1 + a1;
    }
    i++;

    data.l64 = srcmem64[i];
    a2 = a2 + data.l32.l;
    b2 = b2 + a2;
    a2 = a2 + data.l32.h;
    b2 = b2 + a2;
    i++;
  }
  checksum->Set(a1, a2, b1, b2);
  return true;
}

// Copy a block of memory quickly, while keeping a CRC of the data.
// Fall back to a slow word-by-word check if the CRC mismatches.
int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
                              struct page_entry *srcpe) {
  int errors = 0;
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
  uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
  // Remember the expected CRC
  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *targetmem = targetmembase + currentblock * blockwords;
    uint64 *sourcemem = sourcemembase + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrMemcpyC(targetmem, sourcemem, blocksize, &crc, srcpe);
    } else {
      AdlerMemcpyC(targetmem, sourcemem, blocksize, &crc);
    }

    // Investigate miscompares.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcCopyPage Falling through to slow compare, "
                    "CRC mismatch %s != %s\n", crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcCopyPage CRC mismatch %s != %s, "
                     "but no miscompares found. Retrying with fresh data.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
        if (!tag_mode_) {
          // Copy the data originally read from this region back again.
          // This data should contain any corruption we read originally while
          // calculating the CRC.
          memcpy(sourcemem, targetmem, blocksize);
          errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
          if (errorcount == 0) {
            int apic_id = apicid();
            logprintf(0, "Process Error: CPU %d(0x%s) CrcCopyPage "
                         "CRC mismatch %s != %s, "
                         "but no miscompares found on second pass.\n",
                      apic_id, CurrentCpusFormat().c_str(),
                      crc.ToHexString().c_str(),
                      expectedcrc->ToHexString().c_str());
            struct ErrorRecord er;
            er.actual = sourcemem[0];
            er.expected = 0x0;
            er.vaddr = sourcemem;
            ProcessError(&er, 0, "Hardware Error");
          }
        }
      }
      errors += errorcount;
    }
  }

  // Check leftovers for odd-length transfers; we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *targetmem = targetmembase + blocks * blockwords;
    uint64 *sourcemem = sourcemembase + blocks * blockwords;

    errors += CheckRegion(sourcemem,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
    int leftoverwords = leftovers / wordsize_;
    for (int i = 0; i < leftoverwords; i++) {
      targetmem[i] = sourcemem[i];
    }
  }

  // Update pattern reference to reflect new contents.
  dstpe->pattern = srcpe->pattern;

  // Clean clean clean the errors away.
  if (errors) {
    // TODO(nsanders): Maybe we should patch rather than fill? Filling may
    // cause bad data to be propagated across the page.
    FillPage(dstpe);
  }
  return errors;
}


// Invert a block of memory quickly, traversing downwards.
int InvertThread::InvertPageDown(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);

  for (int currentblock = blocks - 1; currentblock >= 0; currentblock--) {
    unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
    for (int i = blockwords - 32; i >= 0; i -= 32) {
      for (int index = i + 31; index >= i; --index) {
        unsigned int actual = sourcemem[index];
        sourcemem[index] = ~actual;
      }
      OsLayer::FastFlush(&sourcemem[i]);
    }
  }

  return 0;
}

// Invert a block of memory, traversing upwards.
int InvertThread::InvertPageUp(struct page_entry *srcpe) {
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  unsigned int *sourcemembase = static_cast<unsigned int *>(srcpe->addr);

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    unsigned int *sourcemem = sourcemembase + currentblock * blockwords;
    for (int i = 0; i < blockwords; i += 32) {
      for (int index = i; index <= i + 31; ++index) {
        unsigned int actual = sourcemem[index];
        sourcemem[index] = ~actual;
      }
      OsLayer::FastFlush(&sourcemem[i]);
    }
  }
  return 0;
}

// Copy a block of memory quickly, while keeping a CRC of the data.
// Fall back to a slow word-by-word check if the CRC mismatches. Warms the
// CPU while running.
int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
                                  struct page_entry *srcpe) {
  int errors = 0;
  const int blocksize = 4096;
  const int blockwords = blocksize / wordsize_;
  int blocks = sat_->page_length() / blocksize;

  // Base addresses for memory copy
  uint64 *targetmembase = static_cast<uint64*>(dstpe->addr);
  uint64 *sourcemembase = static_cast<uint64*>(srcpe->addr);
  // Remember the expected CRC
  const AdlerChecksum *expectedcrc = srcpe->pattern->crc();

  for (int currentblock = 0; currentblock < blocks; currentblock++) {
    uint64 *targetmem = targetmembase + currentblock * blockwords;
    uint64 *sourcemem = sourcemembase + currentblock * blockwords;

    AdlerChecksum crc;
    if (tag_mode_) {
      AdlerAddrMemcpyWarm(targetmem, sourcemem, blocksize, &crc, srcpe);
    } else {
      os_->AdlerMemcpyWarm(targetmem, sourcemem, blocksize, &crc);
    }

    // Investigate miscompares.
    if (!crc.Equals(*expectedcrc)) {
      logprintf(11, "Log: CrcWarmCopyPage Falling through to slow compare, "
                    "CRC mismatch %s != %s\n", crc.ToHexString().c_str(),
                expectedcrc->ToHexString().c_str());
      int errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
      if (errorcount == 0) {
        logprintf(0, "Log: CrcWarmCopyPage CRC mismatch %s != %s, "
                     "but no miscompares found. Retrying with fresh data.\n",
                  crc.ToHexString().c_str(),
                  expectedcrc->ToHexString().c_str());
        if (!tag_mode_) {
          // Copy the data originally read from this region back again.
          // This data should contain any corruption we read originally while
          // calculating the CRC.
          memcpy(sourcemem, targetmem, blocksize);
          errorcount = CheckRegion(sourcemem,
                                   srcpe->pattern,
                                   blocksize,
                                   currentblock * blocksize, 0);
          if (errorcount == 0) {
            int apic_id = apicid();
            logprintf(0, "Process Error: CPU %d(0x%s) CrcWarmCopyPage "
                         "CRC mismatch %s != %s, "
                         "but no miscompares found on second pass.\n",
                      apic_id, CurrentCpusFormat().c_str(),
                      crc.ToHexString().c_str(),
                      expectedcrc->ToHexString().c_str());
            struct ErrorRecord er;
            er.actual = sourcemem[0];
            er.expected = 0x0;
            er.vaddr = sourcemem;
            ProcessError(&er, 0, "Hardware Error");
          }
        }
      }
      errors += errorcount;
    }
  }

  // Check leftovers for odd-length transfers; we should never hit this.
  int leftovers = sat_->page_length() % blocksize;
  if (leftovers) {
    uint64 *targetmem = targetmembase + blocks * blockwords;
    uint64 *sourcemem = sourcemembase + blocks * blockwords;

    errors += CheckRegion(sourcemem,
                          srcpe->pattern,
                          leftovers,
                          blocks * blocksize, 0);
    int leftoverwords = leftovers / wordsize_;
    for (int i = 0; i < leftoverwords; i++) {
      targetmem[i] = sourcemem[i];
    }
  }

  // Update pattern reference to reflect new contents.
  dstpe->pattern = srcpe->pattern;

  // Clean clean clean the errors away.
  if (errors) {
    // TODO(nsanders): Maybe we should patch rather than fill? Filling may
    // cause bad data to be propagated across the page.
    FillPage(dstpe);
  }
  return errors;
}


// Memory check work loop. Execute until done, then exhaust pages.
bool CheckThread::Work() {
  struct page_entry pe;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting Check thread %d\n", thread_num_);

  // We want to check all the pages, and
  // stop when there aren't any left.
  while (true) {
    result = result && sat_->GetValid(&pe);
    if (!result) {
      if (IsReadyToRunNoPause())
        logprintf(0, "Process Error: check_thread failed to pop pages, "
                     "bailing\n");
      else
        result = true;
      break;
    }

    // Do the result check.
    CrcCheckPage(&pe);

    // Push pages back on the valid queue if we are still going,
    // throw them out otherwise.
    if (IsReadyToRunNoPause())
      result = result && sat_->PutValid(&pe);
    else
      result = result && sat_->PutEmpty(&pe);
    if (!result) {
      logprintf(0, "Process Error: check_thread failed to push pages, "
                   "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Check thread. Status %d, %d pages checked\n",
            thread_num_, status_, pages_copied_);
  return result;
}


// Memory copy work loop. Execute until marked done.
bool CopyThread::Work() {
  struct page_entry src;
  struct page_entry dst;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting copy thread %d: cpu %s, mem %x\n",
            thread_num_, cpuset_format(&cpu_mask_).c_str(), tag_);

  while (IsReadyToRun()) {
    // Pop the needed pages.
    result = result && sat_->GetValid(&src, tag_);
    result = result && sat_->GetEmpty(&dst, tag_);
    if (!result) {
      logprintf(0, "Process Error: copy_thread failed to pop pages, "
                   "bailing\n");
      break;
    }

    // Force errors for unittests.
    if (sat_->error_injection()) {
      if (loops == 8) {
        char *addr = reinterpret_cast<char*>(src.addr);
        int offset = random() % sat_->page_length();
        addr[offset] = 0xba;
      }
    }

    // We can use memcpy, or CRC check while we copy.
    if (sat_->warm()) {
      CrcWarmCopyPage(&dst, &src);
    } else if (sat_->strict()) {
      CrcCopyPage(&dst, &src);
    } else {
      memcpy(dst.addr, src.addr, sat_->page_length());
      dst.pattern = src.pattern;
    }

    result = result && sat_->PutValid(&dst);
    result = result && sat_->PutEmpty(&src);

    // Copy worker threads yield themselves at the end of each copy loop,
    // so they don't preempt each other in the middle of the inner copy loop.
    // Cooperation between copy worker threads means less unnecessary cache
    // thrashing (which would happen when context-switching in the middle of
    // the inner copy loop).
    YieldSelf();

    if (!result) {
      logprintf(0, "Process Error: copy_thread failed to push pages, "
                   "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops;
  status_ = result;
  logprintf(9, "Log: Completed %d: Copy thread. Status %d, %d pages copied\n",
            thread_num_, status_, pages_copied_);
  return result;
}

// Memory invert work loop. Execute until marked done.
bool InvertThread::Work() {
  struct page_entry src;
  bool result = true;
  int64 loops = 0;

  logprintf(9, "Log: Starting invert thread %d\n", thread_num_);

  while (IsReadyToRun()) {
    // Pop the needed pages.
    result = result && sat_->GetValid(&src);
    if (!result) {
      logprintf(0, "Process Error: invert_thread failed to pop pages, "
                   "bailing\n");
      break;
    }

    if (sat_->strict())
      CrcCheckPage(&src);

    // For the same reason CopyThread yields itself (see the YieldSelf comment
    // in CopyThread::Work()), InvertThread yields itself after each invert
    // operation to improve cooperation between different worker threads
    // stressing the memory/cache.
    InvertPageUp(&src);
    YieldSelf();
    InvertPageDown(&src);
    YieldSelf();
    InvertPageDown(&src);
    YieldSelf();
    InvertPageUp(&src);
    YieldSelf();

    if (sat_->strict())
      CrcCheckPage(&src);

    result = result && sat_->PutValid(&src);
    if (!result) {
      logprintf(0, "Process Error: invert_thread failed to push pages, "
                   "bailing\n");
      break;
    }
    loops++;
  }

  pages_copied_ = loops * 2;
  status_ = result;
  logprintf(9, "Log: Completed %d: Invert thread. Status %d, %d pages copied\n",
            thread_num_, status_, pages_copied_);
  return result;
}


// Set file name to use for File IO.
void FileThread::SetFile(const char *filename_init) {
  filename_ = filename_init;
  devicename_ = os_->FindFileDevice(filename_);
}

// Open the file for access.
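// O_DIRECT bypasses the page cache, so reads exercise the device rather
// than memory; it also requires sector-aligned buffers, which is why
// PagePrepare() aligns the local bounce buffer to 512 bytes.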
bool FileThread::OpenFile(int *pfile) {
  int fd = open(filename_.c_str(),
                O_RDWR | O_CREAT | O_SYNC | O_DIRECT,
                0644);
  if (fd < 0) {
    logprintf(0, "Process Error: Failed to create file %s!!\n",
              filename_.c_str());
    pages_copied_ = 0;
    return false;
  }
  *pfile = fd;
  return true;
}

// Close the file.
bool FileThread::CloseFile(int fd) {
  close(fd);
  return true;
}

// Write a sector tag into each 512-byte sector of the page.
bool FileThread::SectorTagPage(struct page_entry *src, int block) {
  int page_length = sat_->page_length();
  struct FileThread::SectorTag *tag =
    (struct FileThread::SectorTag *)(src->addr);

  // Tag each sector.
  unsigned char magic = ((0xba + thread_num_) & 0xff);
  for (int sec = 0; sec < page_length / 512; sec++) {
    tag[sec].magic = magic;
    tag[sec].block = block & 0xff;
    tag[sec].sector = sec & 0xff;
    tag[sec].pass = pass_ & 0xff;
  }
  return true;
}

bool FileThread::WritePageToFile(int fd, struct page_entry *src) {
  int page_length = sat_->page_length();
  // Fill the file with our data.
  int64 size = write(fd, src->addr, page_length);

  if (size != page_length) {
    os_->ErrorReport(devicename_.c_str(), "write-error", 1);
    errorcount_++;
    logprintf(0, "Block Error: file_thread failed to write, "
                 "bailing\n");
    return false;
  }
  return true;
}

// Write the data to the file.
bool FileThread::WritePages(int fd) {
  int strict = sat_->strict();

  // Start fresh at beginning of file for each batch of pages.
  lseek64(fd, 0, SEEK_SET);
  for (int i = 0; i < sat_->disk_pages(); i++) {
    struct page_entry src;
    if (!GetValidPage(&src))
      return false;
    // Save expected pattern.
    page_recs_[i].pattern = src.pattern;
    page_recs_[i].src = src.addr;

    // Check data correctness.
    if (strict)
      CrcCheckPage(&src);

    SectorTagPage(&src, i);

    bool result = WritePageToFile(fd, &src);

    if (!PutEmptyPage(&src))
      return false;

    if (!result)
      return false;
  }
  return true;
}

// Copy data from file into memory block.
bool FileThread::ReadPageFromFile(int fd, struct page_entry *dst) {
  int page_length = sat_->page_length();

  // Do the actual read.
  int64 size = read(fd, dst->addr, page_length);
  if (size != page_length) {
    os_->ErrorReport(devicename_.c_str(), "read-error", 1);
    logprintf(0, "Block Error: file_thread failed to read, "
                 "bailing\n");
    errorcount_++;
    return false;
  }
  return true;
}

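// Validate the sector tags written by SectorTagPage(). A stale pass number
// means the device returned an old copy of the sector (a lost write); a
// wrong block or sector number means data landed in, or was fetched from,
// the wrong location on the device.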
1701// Check sector tagging.
1702bool FileThread::SectorValidatePage(const struct PageRec &page,
1703 struct page_entry *dst, int block) {
1704 // Error injection.
1705 static int calls = 0;
1706 calls++;
1707
1708 // Do sector tag compare.
1709 int firstsector = -1;
1710 int lastsector = -1;
1711 bool badsector = false;
1712 int page_length = sat_->page_length();
1713
1714 // Cast data block into an array of tagged sectors.
1715 struct FileThread::SectorTag *tag =
1716 (struct FileThread::SectorTag *)(dst->addr);
1717
1718 sat_assert(sizeof(*tag) == 512);
1719
1720 // Error injection.
1721 if (sat_->error_injection()) {
1722 if (calls == 2) {
1723 for (int badsec = 8; badsec < 17; badsec++)
1724 tag[badsec].pass = 27;
1725 }
1726 if (calls == 18) {
1727 (static_cast<int32*>(dst->addr))[27] = 0xbadda7a;
1728 }
1729 }
1730
1731 // Check each sector for the correct tag we added earlier,
1732 // then revert the tag to the to normal data pattern.
1733 unsigned char magic = ((0xba + thread_num_) & 0xff);
1734 for (int sec = 0; sec < page_length / 512; sec++) {
1735 // Check magic tag.
1736 if ((tag[sec].magic != magic) ||
1737 (tag[sec].block != (block & 0xff)) ||
1738 (tag[sec].sector != (sec & 0xff)) ||
1739 (tag[sec].pass != (pass_ & 0xff))) {
1740 // Offset calculation for tag location.
1741 int offset = sec * sizeof(SectorTag);
1742 if (tag[sec].block != (block & 0xff))
1743 offset += 1 * sizeof(uint8);
1744 else if (tag[sec].sector != (sec & 0xff))
1745 offset += 2 * sizeof(uint8);
1746 else if (tag[sec].pass != (pass_ & 0xff))
1747 offset += 3 * sizeof(uint8);
1748
1749 // Run sector tag error through diagnoser for logging and reporting.
1750 errorcount_ += 1;
1751 os_->error_diagnoser_->AddHDDSectorTagError(devicename_, tag[sec].block,
1752 offset,
1753 tag[sec].sector,
1754 page.src, page.dst);
1755
1756 logprintf(5, "Sector Error: Sector tag @ 0x%x, pass %d/%d. "
1757 "sec %x/%x, block %d/%d, magic %x/%x, File: %s \n",
1758 block * page_length + 512 * sec,
1759 (pass_ & 0xff), (unsigned int)tag[sec].pass,
1760 sec, (unsigned int)tag[sec].sector,
1761 block, (unsigned int)tag[sec].block,
1762 magic, (unsigned int)tag[sec].magic,
1763 filename_.c_str());
1764
1765 // Keep track of first and last bad sector.
1766 if (firstsector == -1)
1767 firstsector = (block * page_length / 512) + sec;
1768 lastsector = (block * page_length / 512) + sec;
1769 badsector = true;
1770 }
1771 // Patch tag back to proper pattern.
1772 unsigned int *addr = (unsigned int *)(&tag[sec]);
1773 *addr = dst->pattern->pattern(512 * sec / sizeof(*addr));
1774 }
1775
1776 // If we found sector errors:
1777 if (badsector == true) {
1778 logprintf(5, "Log: file sector miscompare at offset %x-%x. File: %s\n",
1779 firstsector * 512,
1780 ((lastsector + 1) * 512) - 1,
1781 filename_.c_str());
1782
1783 // Either exit immediately, or patch the data up and continue.
1784 if (sat_->stop_on_error()) {
1785 exit(1);
1786 } else {
1787 // Patch up bad pages.
1788 for (int block = (firstsector * 512) / page_length;
1789 block <= (lastsector * 512) / page_length;
1790 block++) {
1791 unsigned int *memblock = static_cast<unsigned int *>(dst->addr);
1792 int length = page_length / wordsize_;
1793 for (int i = 0; i < length; i++) {
1794 memblock[i] = dst->pattern->pattern(i);
1795 }
1796 }
1797 }
1798 }
1799 return true;
1800}
1801
// Allocate local memory for data transfers, if needed.
bool FileThread::PagePrepare() {
  // We can only do direct IO to SAT pages if it is normal mem.
  page_io_ = os_->normal_mem();

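  // Note: when the SAT pages are not normal cacheable memory, direct IO
  // cannot safely target them, so transfers are staged through the local
  // 512-byte-aligned bounce buffer allocated below (the alignment satisfies
  // O_DIRECT's requirements).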
  // Init a local buffer if we need it.
  if (!page_io_) {
#ifdef HAVE_POSIX_MEMALIGN
    int result = posix_memalign(&local_page_, 512, sat_->page_length());
#else
    local_page_ = memalign(512, sat_->page_length());
    int result = (local_page_ == 0);
#endif
    if (result) {
      logprintf(0, "Process Error: disk thread posix_memalign "
                   "returned %d (fail)\n",
                result);
      status_ = false;
      return false;
    }
  }
  return true;
}


// Remove memory allocated for data transfer.
bool FileThread::PageTeardown() {
  // Free a local buffer if we need to.
  if (!page_io_) {
    free(local_page_);
  }
  return true;
}


// Get memory for an incoming data transfer.
bool FileThread::GetEmptyPage(struct page_entry *dst) {
  if (page_io_) {
    if (!sat_->GetEmpty(dst))
      return false;
  } else {
    dst->addr = local_page_;
    dst->offset = 0;
    dst->pattern = 0;
  }
  return true;
}

// Get memory for an outgoing data transfer.
bool FileThread::GetValidPage(struct page_entry *src) {
  struct page_entry tmp;
  if (!sat_->GetValid(&tmp))
    return false;
  if (page_io_) {
    *src = tmp;
    return true;
  } else {
    src->addr = local_page_;
    src->offset = 0;
    CrcCopyPage(src, &tmp);
    if (!sat_->PutValid(&tmp))
      return false;
  }
  return true;
}


// Throw out a used empty page.
bool FileThread::PutEmptyPage(struct page_entry *src) {
  if (page_io_) {
    if (!sat_->PutEmpty(src))
      return false;
  }
  return true;
}

// Throw out a used, filled page.
bool FileThread::PutValidPage(struct page_entry *src) {
  if (page_io_) {
    if (!sat_->PutValid(src))
      return false;
  }
  return true;
}

1888// Copy data from file into memory blocks.
1889bool FileThread::ReadPages(int fd) {
1890 int page_length = sat_->page_length();
1891 int strict = sat_->strict();
1892 bool result = true;
1893
1894  // Read our data back out of the file, into its new location.
1895 lseek64(fd, 0, SEEK_SET);
1896 for (int i = 0; i < sat_->disk_pages(); i++) {
1897 struct page_entry dst;
1898 if (!GetEmptyPage(&dst))
1899 return false;
1900 // Retrieve expected pattern.
1901 dst.pattern = page_recs_[i].pattern;
1902    // Update page record.
1903 page_recs_[i].dst = dst.addr;
1904
1905 // Read from the file into destination page.
1906 if (!ReadPageFromFile(fd, &dst)) {
1907 PutEmptyPage(&dst);
1908 return false;
1909 }
1910
1911 SectorValidatePage(page_recs_[i], &dst, i);
1912
1913 // Ensure that the transfer ended up with correct data.
1914 if (strict) {
1915 // Record page index currently CRC checked.
1916 crc_page_ = i;
1917 int errors = CrcCheckPage(&dst);
1918 if (errors) {
1919 logprintf(5, "Log: file miscompare at block %d, "
1920 "offset %x-%x. File: %s\n",
1921 i, i * page_length, ((i + 1) * page_length) - 1,
1922 filename_.c_str());
1923 result = false;
1924 }
1925 crc_page_ = -1;
1926 errorcount_ += errors;
1927 }
1928 if (!PutValidPage(&dst))
1929 return false;
1930 }
1931 return result;
1932}
1933
1934// File IO work loop. Execute until marked done.
1935bool FileThread::Work() {
1936 bool result = true;
1937 int64 loops = 0;
1938
1939 logprintf(9, "Log: Starting file thread %d, file %s, device %s\n",
1940 thread_num_,
1941 filename_.c_str(),
1942 devicename_.c_str());
1943
1944 if (!PagePrepare()) {
1945 status_ = false;
1946 return false;
1947 }
1948
1949 // Open the data IO file.
1950 int fd = 0;
1951 if (!OpenFile(&fd)) {
1952 status_ = false;
1953 return false;
1954 }
1955
1956 pass_ = 0;
1957
1958 // Load patterns into page records.
1959 page_recs_ = new struct PageRec[sat_->disk_pages()];
1960 for (int i = 0; i < sat_->disk_pages(); i++) {
1961 page_recs_[i].pattern = new struct Pattern();
1962 }
1963
1964 // Loop until done.
1965 while (IsReadyToRun()) {
1966 // Do the file write.
1967 if (!(result = result && WritePages(fd)))
1968 break;
1969
1970 // Do the file read.
1971 if (!(result = result && ReadPages(fd)))
1972 break;
1973
1974 loops++;
1975 pass_ = loops;
1976 }
1977
1978 pages_copied_ = loops * sat_->disk_pages();
1979
1980 // Clean up.
1981 CloseFile(fd);
1982 PageTeardown();
1983
1984 logprintf(9, "Log: Completed %d: file thread status %d, %d pages copied\n",
1985 thread_num_, status_, pages_copied_);
1986 // Failure to read from device indicates hardware,
1987 // rather than procedural SW error.
1988 status_ = true;
1989 return true;
1990}
1991
1992bool NetworkThread::IsNetworkStopSet() {
1993 return !IsReadyToRunNoPause();
1994}
1995
1996bool NetworkSlaveThread::IsNetworkStopSet() {
1997 // This thread has no completion status.
1998  // It finishes whenever there is no more data to be
1999 // passed back.
2000 return true;
2001}
2002
2003// Set ip name to use for Network IO.
2004void NetworkThread::SetIP(const char *ipaddr_init) {
2005 strncpy(ipaddr_, ipaddr_init, 256);
2006}
2007
2008// Create a socket.
2009// Return false on error.
2010bool NetworkThread::CreateSocket(int *psocket) {
2011 int sock = socket(AF_INET, SOCK_STREAM, 0);
2012 if (sock == -1) {
2013 logprintf(0, "Process Error: Cannot open socket\n");
2014 pages_copied_ = 0;
2015 status_ = false;
2016 return false;
2017 }
2018 *psocket = sock;
2019 return true;
2020}
2021
2022// Close the socket.
2023bool NetworkThread::CloseSocket(int sock) {
2024 close(sock);
2025 return true;
2026}
2027
2028// Initiate the tcp connection.
2029bool NetworkThread::Connect(int sock) {
2030 struct sockaddr_in dest_addr;
2031 dest_addr.sin_family = AF_INET;
2032 dest_addr.sin_port = htons(kNetworkPort);
2033 memset(&(dest_addr.sin_zero), '\0', sizeof(dest_addr.sin_zero));
2034
2035 // Translate dot notation to u32.
2036 if (inet_aton(ipaddr_, &dest_addr.sin_addr) == 0) {
2037 logprintf(0, "Process Error: Cannot resolve %s\n", ipaddr_);
2038 pages_copied_ = 0;
2039 status_ = false;
2040 return false;
2041 }
2042
2043 if (-1 == connect(sock, reinterpret_cast<struct sockaddr *>(&dest_addr),
2044 sizeof(struct sockaddr))) {
2045 logprintf(0, "Process Error: Cannot connect %s\n", ipaddr_);
2046 pages_copied_ = 0;
2047 status_ = false;
2048 return false;
2049 }
2050 return true;
2051}
2052
2053// Bind to the network port and listen for incoming tcp connections.
2054bool NetworkListenThread::Listen() {
2055 struct sockaddr_in sa;
2056
2057 memset(&(sa.sin_zero), '\0', sizeof(sa.sin_zero));
2058
2059 sa.sin_family = AF_INET;
2060 sa.sin_addr.s_addr = INADDR_ANY;
2061 sa.sin_port = htons(kNetworkPort);
2062
2063 if (-1 == bind(sock_, (struct sockaddr*)&sa, sizeof(struct sockaddr))) {
2064 char buf[256];
2065 sat_strerror(errno, buf, sizeof(buf));
2066 logprintf(0, "Process Error: Cannot bind socket: %s\n", buf);
2067 pages_copied_ = 0;
2068 status_ = false;
2069 return false;
2070 }
2071 listen(sock_, 3);
2072 return true;
2073}
2074
2075// Wait for a connection from a network traffic generation thread.
2076bool NetworkListenThread::Wait() {
2077 fd_set rfds;
2078 struct timeval tv;
2079 int retval;
2080
2081 // Watch sock_ to see when it has input.
2082 FD_ZERO(&rfds);
2083 FD_SET(sock_, &rfds);
2084 // Wait up to five seconds.
2085 tv.tv_sec = 5;
2086 tv.tv_usec = 0;
2087
2088 retval = select(sock_ + 1, &rfds, NULL, NULL, &tv);
2089
2090 return (retval > 0);
2091}
2092
2093// Wait for a connection from a network traffic generation thread.
2094bool NetworkListenThread::GetConnection(int *pnewsock) {
2095 struct sockaddr_in sa;
2096 socklen_t size = sizeof(struct sockaddr_in);
2097
2098 int newsock = accept(sock_, reinterpret_cast<struct sockaddr *>(&sa), &size);
2099 if (newsock < 0) {
2100 logprintf(0, "Process Error: Did not receive connection\n");
2101 pages_copied_ = 0;
2102 status_ = false;
2103 return false;
2104 }
2105 *pnewsock = newsock;
2106 return true;
2107}
2108
2109// Send a page, return false if a page was not sent.
2110bool NetworkThread::SendPage(int sock, struct page_entry *src) {
2111 int page_length = sat_->page_length();
2112 char *address = static_cast<char*>(src->addr);
2113
2114 // Send our data over the network.
2115 int size = page_length;
2116 while (size) {
2117 int transferred = send(sock, address + (page_length - size), size, 0);
2118 if ((transferred == 0) || (transferred == -1)) {
2119 if (!IsNetworkStopSet()) {
2120 char buf[256] = "";
2121 sat_strerror(errno, buf, sizeof(buf));
2122 logprintf(0, "Process Error: Thread %d, "
2123 "Network write failed, bailing. (%s)\n",
2124 thread_num_, buf);
2125 status_ = false;
2126 }
2127 return false;
2128 }
2129 size = size - transferred;
2130 }
2131 return true;
2132}
2133
2134// Receive a page. Return false if a page was not received.
2135bool NetworkThread::ReceivePage(int sock, struct page_entry *dst) {
2136 int page_length = sat_->page_length();
2137 char *address = static_cast<char*>(dst->addr);
2138
2139 // Maybe we will get our data back again, maybe not.
2140 int size = page_length;
2141 while (size) {
2142 int transferred = recv(sock, address + (page_length - size), size, 0);
2143 if ((transferred == 0) || (transferred == -1)) {
2144      // Typically the network slave thread should exit when the network
2145      // master thread stops sending data.
2146 if (IsNetworkStopSet()) {
2147 int err = errno;
2148 if (transferred == 0 && err == 0) {
2149 // Two system setups will not sync exactly,
2150 // allow early exit, but log it.
2151 logprintf(0, "Log: Net thread did not receive any data, exiting.\n");
2152 } else {
2153 char buf[256] = "";
2154 sat_strerror(err, buf, sizeof(buf));
2155 // Print why we failed.
2156 logprintf(0, "Process Error: Thread %d, "
2157 "Network read failed, bailing (%s).\n",
2158 thread_num_, buf);
2159 status_ = false;
2160 // Print arguments and results.
2161 logprintf(0, "Log: recv(%d, address %x, size %x, 0) == %x, err %d\n",
2162 sock, address + (page_length - size),
2163 size, transferred, err);
2164 if ((transferred == 0) &&
2165 (page_length - size < 512) &&
2166 (page_length - size > 0)) {
2167            // Print null-terminated data received, to see who's been
2168            // sending us suspicious unwanted data.
2169 address[page_length - size] = 0;
2170 logprintf(0, "Log: received %d bytes: '%s'\n",
2171 page_length - size, address);
2172 }
2173 }
2174 }
2175 return false;
2176 }
2177 size = size - transferred;
2178 }
2179 return true;
2180}
2181
2182// Network IO work loop. Execute until marked done.
2183// Return true if the thread ran as expected.
2184bool NetworkThread::Work() {
2185 logprintf(9, "Log: Starting network thread %d, ip %s\n",
2186 thread_num_,
2187 ipaddr_);
2188
2189 // Make a socket.
2190 int sock = 0;
2191 if (!CreateSocket(&sock))
2192 return false;
2193
2194  // The network IO loop requires the network slave thread to have already
2195  // initialized. We will sleep here for a while to ensure that the slave
2196  // thread will be listening by the time we connect.
2197 // Sleep for 15 seconds.
2198 sat_sleep(15);
2199 logprintf(9, "Log: Starting execution of network thread %d, ip %s\n",
2200 thread_num_,
2201 ipaddr_);
2202
2203
2204 // Connect to a slave thread.
2205 if (!Connect(sock))
2206 return false;
2207
2208 // Loop until done.
2209 bool result = true;
2210 int strict = sat_->strict();
2211 int64 loops = 0;
2212 while (IsReadyToRun()) {
2213 struct page_entry src;
2214 struct page_entry dst;
2215 result = result && sat_->GetValid(&src);
2216 result = result && sat_->GetEmpty(&dst);
2217 if (!result) {
2218 logprintf(0, "Process Error: net_thread failed to pop pages, "
2219 "bailing\n");
2220 break;
2221 }
2222
2223 // Check data correctness.
2224 if (strict)
2225 CrcCheckPage(&src);
2226
2227 // Do the network write.
2228 if (!(result = result && SendPage(sock, &src)))
2229 break;
2230
2231 // Update pattern reference to reflect new contents.
2232 dst.pattern = src.pattern;
2233
2234 // Do the network read.
2235 if (!(result = result && ReceivePage(sock, &dst)))
2236 break;
2237
2238 // Ensure that the transfer ended up with correct data.
2239 if (strict)
2240 CrcCheckPage(&dst);
2241
2242 // Return all of our pages to the queue.
2243 result = result && sat_->PutValid(&dst);
2244 result = result && sat_->PutEmpty(&src);
2245 if (!result) {
2246 logprintf(0, "Process Error: net_thread failed to push pages, "
2247 "bailing\n");
2248 break;
2249 }
2250 loops++;
2251 }
2252
2253 pages_copied_ = loops;
2254 status_ = result;
2255
2256 // Clean up.
2257 CloseSocket(sock);
2258
2259 logprintf(9, "Log: Completed %d: network thread status %d, "
2260 "%d pages copied\n",
2261 thread_num_, status_, pages_copied_);
2262 return result;
2263}
2264
2265// Spawn slave threads for incoming connections.
2266bool NetworkListenThread::SpawnSlave(int newsock, int threadid) {
2267 logprintf(12, "Log: Listen thread spawning slave\n");
2268
2269 // Spawn slave thread, to reflect network traffic back to sender.
2270 ChildWorker *child_worker = new ChildWorker;
2271 child_worker->thread.SetSock(newsock);
2272 child_worker->thread.InitThread(threadid, sat_, os_, patternlist_,
2273 &child_worker->status);
2274 child_worker->status.Initialize();
2275 child_worker->thread.SpawnThread();
2276 child_workers_.push_back(child_worker);
2277
2278 return true;
2279}
2280
2281// Reap slave threads.
2282bool NetworkListenThread::ReapSlaves() {
2283 bool result = true;
2284 // Gather status and reap threads.
2285 logprintf(12, "Log: Joining all outstanding threads\n");
2286
2287 for (size_t i = 0; i < child_workers_.size(); i++) {
2288 NetworkSlaveThread& child_thread = child_workers_[i]->thread;
2289 logprintf(12, "Log: Joining slave thread %d\n", i);
2290 child_thread.JoinThread();
2291 if (child_thread.GetStatus() != 1) {
2292 logprintf(0, "Process Error: Slave Thread %d failed with status %d\n", i,
2293 child_thread.GetStatus());
2294 result = false;
2295 }
2296 errorcount_ += child_thread.GetErrorCount();
2297 logprintf(9, "Log: Slave Thread %d found %lld miscompares\n", i,
2298 child_thread.GetErrorCount());
2299 pages_copied_ += child_thread.GetPageCount();
2300 }
2301
2302 return result;
2303}
2304
2305// Network listener IO work loop. Execute until marked done.
2306// Return false on fatal software error.
2307bool NetworkListenThread::Work() {
2308 logprintf(9, "Log: Starting network listen thread %d\n",
2309 thread_num_);
2310
2311 // Make a socket.
2312 sock_ = 0;
2313 if (!CreateSocket(&sock_)) {
2314 status_ = false;
2315 return false;
2316 }
2317 logprintf(9, "Log: Listen thread created sock\n");
2318
2319  // Allow incoming connections to be queued up by the socket library.
2320 int newsock = 0;
2321 Listen();
2322 logprintf(12, "Log: Listen thread waiting for incoming connections\n");
2323
2324 // Wait on incoming connections, and spawn worker threads for them.
2325 int threadcount = 0;
2326 while (IsReadyToRun()) {
2327 // Poll for connections that we can accept().
2328 if (Wait()) {
2329 // Accept those connections.
2330 logprintf(12, "Log: Listen thread found incoming connection\n");
2331 if (GetConnection(&newsock)) {
2332 SpawnSlave(newsock, threadcount);
2333 threadcount++;
2334 }
2335 }
2336 }
2337
2338 // Gather status and join spawned threads.
2339 ReapSlaves();
2340
2341 // Delete the child workers.
2342 for (ChildVector::iterator it = child_workers_.begin();
2343 it != child_workers_.end(); ++it) {
2344 (*it)->status.Destroy();
2345 delete *it;
2346 }
2347 child_workers_.clear();
2348
2349 CloseSocket(sock_);
2350
2351 status_ = true;
2352 logprintf(9,
2353 "Log: Completed %d: network listen thread status %d, "
2354 "%d pages copied\n",
2355 thread_num_, status_, pages_copied_);
2356 return true;
2357}
2358
2359// Set the network reflector socket.
2360void NetworkSlaveThread::SetSock(int sock) {
2361 sock_ = sock;
2362}
2363
2364// Network reflector IO work loop. Execute until marked done.
2365// Return false on fatal software error.
2366bool NetworkSlaveThread::Work() {
2367 logprintf(9, "Log: Starting network slave thread %d\n",
2368 thread_num_);
2369
2370 // Verify that we have a socket.
2371 int sock = sock_;
2372 if (!sock) {
2373 status_ = false;
2374 return false;
2375 }
2376
2377 // Loop until done.
2378 int64 loops = 0;
2379 // Init a local buffer for storing data.
2380 void *local_page = NULL;
2381#ifdef HAVE_POSIX_MEMALIGN
2382  int result = posix_memalign(&local_page, 512, sat_->page_length());
2383#else
2384  local_page = memalign(512, sat_->page_length());
2385  int result = (local_page == 0);
2386#endif
2387  if (result) {
2388 logprintf(0, "Process Error: net slave posix_memalign "
2389 "returned %d (fail)\n",
2390 result);
2391 status_ = false;
2392 return false;
2393 }
2394
2395 struct page_entry page;
2396 page.addr = local_page;
2397
2398 // This thread will continue to run as long as the thread on the other end of
2399 // the socket is still sending and receiving data.
2400 while (1) {
2401 // Do the network read.
2402 if (!ReceivePage(sock, &page))
2403 break;
2404
2405 // Do the network write.
2406 if (!SendPage(sock, &page))
2407 break;
2408
2409 loops++;
2410 }
2411
2412 pages_copied_ = loops;
2413 // No results provided from this type of thread.
2414 status_ = true;
2415
2416 // Clean up.
2417 CloseSocket(sock);
2418
2419 logprintf(9,
2420 "Log: Completed %d: network slave thread status %d, "
2421 "%d pages copied\n",
2422 thread_num_, status_, pages_copied_);
2423 return true;
2424}
2425
2426// Thread work loop. Execute until marked finished.
2427bool ErrorPollThread::Work() {
2428 logprintf(9, "Log: Starting system error poll thread %d\n", thread_num_);
2429
2430 // This calls a generic error polling function in the Os abstraction layer.
2431 do {
2432 errorcount_ += os_->ErrorPoll();
2433 os_->ErrorWait();
2434 } while (IsReadyToRun());
2435
2436 logprintf(9, "Log: Finished system error poll thread %d: %d errors\n",
2437 thread_num_, errorcount_);
2438 status_ = true;
2439 return true;
2440}
2441
2442// Worker thread to heat up CPU.
2443// This thread does not evaluate pass/fail or software error.
2444bool CpuStressThread::Work() {
2445 logprintf(9, "Log: Starting CPU stress thread %d\n", thread_num_);
2446
2447 do {
2448 // Run ludloff's platform/CPU-specific assembly workload.
2449 os_->CpuStressWorkload();
2450 YieldSelf();
2451 } while (IsReadyToRun());
2452
2453 logprintf(9, "Log: Finished CPU stress thread %d:\n",
2454 thread_num_);
2455 status_ = true;
2456 return true;
2457}
2458
2459CpuCacheCoherencyThread::CpuCacheCoherencyThread(cc_cacheline_data *data,
2460 int cacheline_count,
2461 int thread_num,
2462 int inc_count) {
2463 cc_cacheline_data_ = data;
2464 cc_cacheline_count_ = cacheline_count;
2465 cc_thread_num_ = thread_num;
2466 cc_inc_count_ = inc_count;
2467}
2468
2469// Worker thread to test the cache coherency of the CPUs.
2470// Return false on fatal sw error.
2471bool CpuCacheCoherencyThread::Work() {
2472 logprintf(9, "Log: Starting the Cache Coherency thread %d\n",
2473 cc_thread_num_);
2474 uint64 time_start, time_end;
2475 struct timeval tv;
2476
2477 unsigned int seed = static_cast<unsigned int>(gettid());
2478 gettimeofday(&tv, NULL); // Get the timestamp before increments.
2479 time_start = tv.tv_sec * 1000000ULL + tv.tv_usec;
2480
2481 uint64 total_inc = 0; // Total increments done by the thread.
2482 while (IsReadyToRun()) {
2483 for (int i = 0; i < cc_inc_count_; i++) {
2484      // Choose a data structure at random and increment the appropriate
2485      // member according to the offset (which is the same as the
2486      // thread number).
2487#ifdef HAVE_RAND_R
2488      int r = rand_r(&seed);
2489#else
2490      int r = rand();
2491#endif
2492      r = cc_cacheline_count_ * (r / (RAND_MAX + 1.0));
2493      // Increment the member of the randomly selected structure.
2494      (cc_cacheline_data_[r].num[cc_thread_num_])++;
2495 }
2496
2497 total_inc += cc_inc_count_;
2498
2499    // Check that the global value summed over all the cacheline
2500    // structures matches the local counter for this particular thread.
2501 int cc_global_num = 0;
2502 for (int cline_num = 0; cline_num < cc_cacheline_count_; cline_num++) {
2503 cc_global_num += cc_cacheline_data_[cline_num].num[cc_thread_num_];
2504      // Reset the cacheline member's value for the next run.
2505 cc_cacheline_data_[cline_num].num[cc_thread_num_] = 0;
2506 }
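    // Worked example (assumed numbers): with cc_inc_count_ = 1000, this
    // thread's slot summed across all cachelines must come back to exactly
    // 1000; a lost update caused by broken cacheline ownership would show
    // up as a shortfall in the comparison below.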
2507 if (sat_->error_injection())
2508 cc_global_num = -1;
2509
2510 if (cc_global_num != cc_inc_count_) {
2511 errorcount_++;
2512 logprintf(0, "Hardware Error: global(%d) and local(%d) do not match\n",
2513 cc_global_num, cc_inc_count_);
2514 }
2515 }
2516 gettimeofday(&tv, NULL); // Get the timestamp at the end.
2517 time_end = tv.tv_sec * 1000000ULL + tv.tv_usec;
2518
2519 uint64 us_elapsed = time_end - time_start;
2520 // inc_rate is the no. of increments per second.
2521 double inc_rate = total_inc * 1e6 / us_elapsed;
2522
2523 logprintf(4, "Stats: CC Thread(%d): Time=%llu us,"
2524 " Increments=%llu, Increments/sec = %.6lf\n",
2525 cc_thread_num_, us_elapsed, total_inc, inc_rate);
2526 logprintf(9, "Log: Finished CPU Cache Coherency thread %d:\n",
2527 cc_thread_num_);
2528 status_ = true;
2529 return true;
2530}
2531
2532DiskThread::DiskThread(DiskBlockTable *block_table) {
2533 read_block_size_ = kSectorSize; // default 1 sector (512 bytes)
2534 write_block_size_ = kSectorSize; // this assumes read and write block size
2535 // are the same
2536 segment_size_ = -1; // use the entire disk as one segment
2537 cache_size_ = 16 * 1024 * 1024; // assume 16MiB cache by default
2538 // Use a queue such that 3/2 times as much data as the cache can hold
2539 // is written before it is read so that there is little chance the read
2540 // data is in the cache.
2541 queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
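  // Worked example with the defaults above: 16MiB / 512 bytes = 32768
  // blocks fit in the cache, so queue_size_ = (32768 * 3) / 2 = 49152
  // blocks are written before the oldest one is read back.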
2542 blocks_per_segment_ = 32;
2543
2544 read_threshold_ = 100000; // 100ms is a reasonable limit for
2545 write_threshold_ = 100000; // reading/writing a sector
2546
2547 read_timeout_ = 5000000; // 5 seconds should be long enough for a
2548  write_timeout_ = 5000000;     // timeout for reading/writing
2549
2550 device_sectors_ = 0;
2551 non_destructive_ = 0;
2552
2553#ifdef HAVE_LIBAIO_H
2554  aio_ctx_ = 0;
2555#endif
2556  block_table_ = block_table;
2557 update_block_table_ = 1;
2558
2559 block_buffer_ = NULL;
2560
2561 blocks_written_ = 0;
2562 blocks_read_ = 0;
2563}
2564
2565DiskThread::~DiskThread() {
2566 if (block_buffer_)
2567 free(block_buffer_);
2568}
2569
2570// Set filename for device file (in /dev).
2571void DiskThread::SetDevice(const char *device_name) {
2572 device_name_ = device_name;
2573}
2574
2575// Set various parameters that control the behaviour of the test.
2576// -1 is used as a sentinel value on each parameter (except non_destructive)
2577 // to indicate that the parameter should not be set.
2578bool DiskThread::SetParameters(int read_block_size,
2579 int write_block_size,
2580 int64 segment_size,
2581 int64 cache_size,
2582 int blocks_per_segment,
2583 int64 read_threshold,
2584 int64 write_threshold,
2585 int non_destructive) {
2586 if (read_block_size != -1) {
2587 // Blocks must be aligned to the disk's sector size.
2588 if (read_block_size % kSectorSize != 0) {
2589 logprintf(0, "Process Error: Block size must be a multiple of %d "
2590 "(thread %d).\n", kSectorSize, thread_num_);
2591 return false;
2592 }
2593
2594 read_block_size_ = read_block_size;
2595 }
2596
2597 if (write_block_size != -1) {
2598 // Write blocks must be aligned to the disk's sector size and to the
2599 // block size.
2600 if (write_block_size % kSectorSize != 0) {
2601 logprintf(0, "Process Error: Write block size must be a multiple "
2602 "of %d (thread %d).\n", kSectorSize, thread_num_);
2603 return false;
2604 }
2605 if (write_block_size % read_block_size_ != 0) {
2606 logprintf(0, "Process Error: Write block size must be a multiple "
2607 "of the read block size, which is %d (thread %d).\n",
2608 read_block_size_, thread_num_);
2609 return false;
2610 }
2611
2612 write_block_size_ = write_block_size;
2613
2614 } else {
2615 // Make sure write_block_size_ is still valid.
2616 if (read_block_size_ > write_block_size_) {
2617 logprintf(5, "Log: Assuming write block size equal to read block size, "
2618 "which is %d (thread %d).\n", read_block_size_,
2619 thread_num_);
2620 write_block_size_ = read_block_size_;
2621 } else {
2622 if (write_block_size_ % read_block_size_ != 0) {
2623 logprintf(0, "Process Error: Write block size (defined as %d) must "
2624 "be a multiple of the read block size, which is %d "
2625 "(thread %d).\n", write_block_size_, read_block_size_,
2626 thread_num_);
2627 return false;
2628 }
2629 }
2630 }
2631
2632 if (cache_size != -1) {
2633 cache_size_ = cache_size;
2634 }
2635
2636 if (blocks_per_segment != -1) {
2637 if (blocks_per_segment <= 0) {
2638 logprintf(0, "Process Error: Blocks per segment must be greater than "
2639 "zero.\n (thread %d)", thread_num_);
2640 return false;
2641 }
2642
2643 blocks_per_segment_ = blocks_per_segment;
2644 }
2645
2646 if (read_threshold != -1) {
2647 if (read_threshold <= 0) {
2648 logprintf(0, "Process Error: Read threshold must be greater than "
2649 "zero (thread %d).\n", thread_num_);
2650 return false;
2651 }
2652
2653 read_threshold_ = read_threshold;
2654 }
2655
2656 if (write_threshold != -1) {
2657 if (write_threshold <= 0) {
2658 logprintf(0, "Process Error: Write threshold must be greater than "
2659 "zero (thread %d).\n", thread_num_);
2660 return false;
2661 }
2662
2663 write_threshold_ = write_threshold;
2664 }
2665
2666 if (segment_size != -1) {
2667 // Segments must be aligned to the disk's sector size.
2668 if (segment_size % kSectorSize != 0) {
2669 logprintf(0, "Process Error: Segment size must be a multiple of %d"
2670 " (thread %d).\n", kSectorSize, thread_num_);
2671 return false;
2672 }
2673
2674 segment_size_ = segment_size / kSectorSize;
2675 }
2676
2677 non_destructive_ = non_destructive;
2678
2679  // Having a queue of 150% of the blocks that fit in the disk's cache
2680  // should be enough to force out the oldest block before it is read, and
2681  // hence ensure the data comes from the disk and not the cache.
2682 queue_size_ = ((cache_size_ / write_block_size_) * 3) / 2;
2683 // Updating DiskBlockTable parameters
2684 if (update_block_table_) {
2685 block_table_->SetParameters(kSectorSize, write_block_size_,
2686 device_sectors_, segment_size_,
2687 device_name_);
2688 }
2689 return true;
2690}
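// Usage sketch (hypothetical values, not taken from this file): a call like
//   SetParameters(512, 4096, -1, -1, 32, 100000, 100000, 0)
// keeps the whole-disk segment and the default cache size, writes 4096-byte
// blocks, and re-reads them in 512-byte chunks; it passes the checks above
// because 4096 is a multiple of both kSectorSize and the read block size.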
2691
2692// Open a device, return false on failure.
2693bool DiskThread::OpenDevice(int *pfile) {
2694 int fd = open(device_name_.c_str(),
2695 O_RDWR | O_SYNC | O_DIRECT | O_LARGEFILE,
2696 0);
2697 if (fd < 0) {
2698 logprintf(0, "Process Error: Failed to open device %s (thread %d)!!\n",
2699 device_name_.c_str(), thread_num_);
2700 return false;
2701 }
2702 *pfile = fd;
2703
2704 return GetDiskSize(fd);
2705}
2706
2707// Retrieves the size (in bytes) of the disk/file.
2708// Return false on failure.
2709bool DiskThread::GetDiskSize(int fd) {
2710 struct stat device_stat;
2711 if (fstat(fd, &device_stat) == -1) {
2712 logprintf(0, "Process Error: Unable to fstat disk %s (thread %d).\n",
2713 device_name_.c_str(), thread_num_);
2714 return false;
2715 }
2716
2717 // For a block device, an ioctl is needed to get the size since the size
2718  // of the device file (e.g., /dev/sdb) is 0.
2719 if (S_ISBLK(device_stat.st_mode)) {
2720 uint64 block_size = 0;
2721
2722 if (ioctl(fd, BLKGETSIZE64, &block_size) == -1) {
2723 logprintf(0, "Process Error: Unable to ioctl disk %s (thread %d).\n",
2724 device_name_.c_str(), thread_num_);
2725 return false;
2726 }
2727
2728    // Zero size indicates a nonworking device.
2729 if (block_size == 0) {
2730 os_->ErrorReport(device_name_.c_str(), "device-size-zero", 1);
2731 ++errorcount_;
2732 status_ = true; // Avoid a procedural error.
2733 return false;
2734 }
2735
2736 device_sectors_ = block_size / kSectorSize;
2737
2738 } else if (S_ISREG(device_stat.st_mode)) {
2739 device_sectors_ = device_stat.st_size / kSectorSize;
2740
2741 } else {
2742 logprintf(0, "Process Error: %s is not a regular file or block "
2743 "device (thread %d).\n", device_name_.c_str(),
2744 thread_num_);
2745 return false;
2746 }
2747
2748 logprintf(12, "Log: Device sectors: %lld on disk %s (thread %d).\n",
2749 device_sectors_, device_name_.c_str(), thread_num_);
2750
2751 if (update_block_table_) {
2752 block_table_->SetParameters(kSectorSize, write_block_size_,
2753 device_sectors_, segment_size_,
2754 device_name_);
2755 }
2756
2757 return true;
2758}
2759
2760bool DiskThread::CloseDevice(int fd) {
2761 close(fd);
2762 return true;
2763}
2764
2765// Return the time in microseconds.
2766int64 DiskThread::GetTime() {
2767 struct timeval tv;
2768 gettimeofday(&tv, NULL);
2769 return tv.tv_sec * 1000000 + tv.tv_usec;
2770}
2771
2772// Do randomized reads and (possibly) writes on a device.
2773// Return false on fatal SW error, true on SW success,
2774// regardless of whether HW failed.
2775bool DiskThread::DoWork(int fd) {
2776 int64 block_num = 0;
2777 int64 num_segments;
2778
2779 if (segment_size_ == -1) {
2780 num_segments = 1;
2781 } else {
2782 num_segments = device_sectors_ / segment_size_;
2783 if (device_sectors_ % segment_size_ != 0)
2784 num_segments++;
2785 }
2786
2787 // Disk size should be at least 3x cache size. See comment later for
2788 // details.
2789 sat_assert(device_sectors_ * kSectorSize > 3 * cache_size_);
2790
2791 // This disk test works by writing blocks with a certain pattern to
2792 // disk, then reading them back and verifying it against the pattern
2793 // at a later time. A failure happens when either the block cannot
2794  // be written/read or when the read block is different from what was
2795 // written. If a block takes too long to write/read, then a warning
2796 // is given instead of an error since taking too long is not
2797 // necessarily an error.
2798 //
2799 // To prevent the read blocks from coming from the disk cache,
2800 // enough blocks are written before read such that a block would
2801 // be ejected from the disk cache by the time it is read.
2802 //
2803 // TODO(amistry): Implement some sort of read/write throttling. The
2804 // flood of asynchronous I/O requests when a drive is
2805 // unplugged is causing the application and kernel to
2806 // become unresponsive.
2807
2808 while (IsReadyToRun()) {
2809 // Write blocks to disk.
2810 logprintf(16, "Log: Write phase %sfor disk %s (thread %d).\n",
2811 non_destructive_ ? "(disabled) " : "",
2812 device_name_.c_str(), thread_num_);
2813 while (IsReadyToRunNoPause() &&
2814 in_flight_sectors_.size() <
2815 static_cast<size_t>(queue_size_ + 1)) {
2816 // Confine testing to a particular segment of the disk.
2817 int64 segment = (block_num / blocks_per_segment_) % num_segments;
2818 if (!non_destructive_ &&
2819 (block_num % blocks_per_segment_ == 0)) {
2820 logprintf(20, "Log: Starting to write segment %lld out of "
2821 "%lld on disk %s (thread %d).\n",
2822 segment, num_segments, device_name_.c_str(),
2823 thread_num_);
2824 }
2825 block_num++;
2826
2827 BlockData *block = block_table_->GetUnusedBlock(segment);
2828
2829      // If an unused sequence of sectors could not be found, skip to the
2830      // next block to process. Soon, a new segment will come and new
2831      // sectors will be able to be allocated. This effectively puts a
2832      // minimum on the disk size of 3x the stated cache size, or 48MiB
2833      // if a cache size is not given (since the cache is set to 16MiB
2834      // by default). Given that today's caches are in the low MiB range
2835      // and drive sizes in the mid-GB range, this shouldn't pose a problem.
2836 // The 3x minimum comes from the following:
2837 // 1. In order to allocate 'y' blocks from a segment, the
2838 // segment must contain at least 2y blocks or else an
2839 // allocation may not succeed.
2840 // 2. Assume the entire disk is one segment.
2841 // 3. A full write phase consists of writing blocks corresponding to
2842 // 3/2 cache size.
2843 // 4. Therefore, the one segment must have 2 * 3/2 * cache
2844 // size worth of blocks = 3 * cache size worth of blocks
2845 // to complete.
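      // Worked example of the 3x minimum (defaults assumed): with a 16MiB
      // cache, one write phase covers 3/2 * 16MiB = 24MiB of blocks, and
      // the 2x allocation rule in step 1 doubles that to 48MiB of disk,
      // matching the sat_assert at the top of DoWork.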
2846 // In non-destructive mode, don't write anything to disk.
2847 if (!non_destructive_) {
2848 if (!WriteBlockToDisk(fd, block)) {
2849 block_table_->RemoveBlock(block);
2850 return true;
2851 }
2852 blocks_written_++;
2853 }
2854
2855      // The block is either initialized by writing or, in the
2856      // non-destructive case, by being added into the data structure
2857      // for later reading.
2857 block->SetBlockAsInitialized();
2858
2859 in_flight_sectors_.push(block);
2860 }
2861
2862 // Verify blocks on disk.
2863 logprintf(20, "Log: Read phase for disk %s (thread %d).\n",
2864 device_name_.c_str(), thread_num_);
2865 while (IsReadyToRunNoPause() && !in_flight_sectors_.empty()) {
2866 BlockData *block = in_flight_sectors_.front();
2867 in_flight_sectors_.pop();
2868 if (!ValidateBlockOnDisk(fd, block))
2869 return true;
2870 block_table_->RemoveBlock(block);
2871 blocks_read_++;
2872 }
2873 }
2874
2875 pages_copied_ = blocks_written_ + blocks_read_;
2876 return true;
2877}
2878
2879// Do an asynchronous disk I/O operation.
2880// Return false if the IO is not set up.
2881bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size,
2882 int64 offset, int64 timeout) {
2883#ifdef HAVE_LIBAIO_H
2884  // Use the Linux native asynchronous I/O interface for reading/writing.
2885 // A read/write consists of three basic steps:
2886 // 1. create an io context.
2887 // 2. prepare and submit an io request to the context
2888 // 3. wait for an event on the context.
2889
2890 struct {
2891 const int opcode;
2892 const char *op_str;
2893 const char *error_str;
2894 } operations[2] = {
2895 { IO_CMD_PREAD, "read", "disk-read-error" },
2896 { IO_CMD_PWRITE, "write", "disk-write-error" }
2897 };
2898
2899 struct iocb cb;
2900 memset(&cb, 0, sizeof(cb));
2901
2902 cb.aio_fildes = fd;
2903 cb.aio_lio_opcode = operations[op].opcode;
2904 cb.u.c.buf = buf;
2905 cb.u.c.nbytes = size;
2906 cb.u.c.offset = offset;
2907
2908 struct iocb *cbs[] = { &cb };
2909 if (io_submit(aio_ctx_, 1, cbs) != 1) {
2910 int error = errno;
2911 char buf[256];
2912 sat_strerror(error, buf, sizeof(buf));
2913 logprintf(0, "Process Error: Unable to submit async %s "
2914 "on disk %s (thread %d). Error %d, %s\n",
2915 operations[op].op_str, device_name_.c_str(),
2916 thread_num_, error, buf);
2917 return false;
2918 }
2919
2920 struct io_event event;
2921 memset(&event, 0, sizeof(event));
2922 struct timespec tv;
2923 tv.tv_sec = timeout / 1000000;
2924 tv.tv_nsec = (timeout % 1000000) * 1000;
2925 if (io_getevents(aio_ctx_, 1, 1, &event, &tv) != 1) {
2926 // A ctrl-c from the keyboard will cause io_getevents to fail with an
2927 // EINTR error code. This is not an error and so don't treat it as such,
2928 // but still log it.
2929 int error = errno;
2930 if (error == EINTR) {
2931 logprintf(5, "Log: %s interrupted on disk %s (thread %d).\n",
2932 operations[op].op_str, device_name_.c_str(),
2933 thread_num_);
2934 } else {
2935 os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
2936 errorcount_ += 1;
2937 logprintf(0, "Hardware Error: Timeout doing async %s to sectors "
2938 "starting at %lld on disk %s (thread %d).\n",
2939 operations[op].op_str, offset / kSectorSize,
2940 device_name_.c_str(), thread_num_);
2941 }
2942
2943    // Don't bother checking return codes, since io_cancel seems to always
2944    // fail. As a workaround for canceling an in-progress I/O operation,
2945    // destroy and recreate the I/O context.
2946 // TODO(amistry): Find out why io_cancel isn't working and make it work.
2947 io_cancel(aio_ctx_, &cb, &event);
2948 io_destroy(aio_ctx_);
2949 aio_ctx_ = 0;
2950 if (io_setup(5, &aio_ctx_)) {
2951 int error = errno;
2952 char buf[256];
2953 sat_strerror(error, buf, sizeof(buf));
2954 logprintf(0, "Process Error: Unable to create aio context on disk %s"
2955 " (thread %d) Error %d, %s\n",
2956 device_name_.c_str(), thread_num_, error, buf);
2957 }
2958
2959 return false;
2960 }
2961
2962  // event.res contains the number of bytes written/read, or a negative
2963  // error code on failure.
2964 if (event.res != static_cast<uint64>(size)) {
2965 errorcount_++;
2966 os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
2967
2968 if (event.res < 0) {
2969 switch (event.res) {
2970 case -EIO:
2971 logprintf(0, "Hardware Error: Low-level I/O error while doing %s to "
2972 "sectors starting at %lld on disk %s (thread %d).\n",
2973 operations[op].op_str, offset / kSectorSize,
2974 device_name_.c_str(), thread_num_);
2975 break;
2976 default:
2977 logprintf(0, "Hardware Error: Unknown error while doing %s to "
2978 "sectors starting at %lld on disk %s (thread %d).\n",
2979 operations[op].op_str, offset / kSectorSize,
2980 device_name_.c_str(), thread_num_);
2981 }
2982 } else {
2983 logprintf(0, "Hardware Error: Unable to %s to sectors starting at "
2984 "%lld on disk %s (thread %d).\n",
2985 operations[op].op_str, offset / kSectorSize,
2986 device_name_.c_str(), thread_num_);
2987 }
2988 return false;
2989 }
2990
2991 return true;
2992#else  // !HAVE_LIBAIO_H
2993  return false;
2994#endif
2995}
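// A minimal, self-contained sketch (illustration only, not part of the
// original file) of the same setup/submit/getevents sequence AsyncDiskIO
// uses. io_prep_pread is the libaio helper equivalent of the manual iocb
// setup above; the NULL timeout blocks until completion.
#ifdef HAVE_LIBAIO_H
static inline bool ExampleAsyncRead(int fd, void *buf, size_t size,
                                    int64 offset) {
  io_context_t ctx = 0;
  if (io_setup(1, &ctx))  // 1. Create an io context.
    return false;
  struct iocb cb;
  io_prep_pread(&cb, fd, buf, size, offset);  // 2. Prepare the request...
  struct iocb *cbs[1] = { &cb };
  bool ok = (io_submit(ctx, 1, cbs) == 1);    //    ...and submit it.
  if (ok) {
    struct io_event event;
    // 3. Wait for the completion event and check the transferred size.
    ok = (io_getevents(ctx, 1, 1, &event, NULL) == 1) &&
         (event.res == static_cast<uint64>(size));
  }
  io_destroy(ctx);
  return ok;
}
#endif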
2996
2997// Write a block to disk.
2998// Return false if the block is not written.
2999bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) {
3000 memset(block_buffer_, 0, block->GetSize());
3001
3002 // Fill block buffer with a pattern
3003 struct page_entry pe;
3004 if (!sat_->GetValid(&pe)) {
3005    // Even though a valid page could not be obtained, it is not an error
3006    // since we can always fill in a pattern directly, albeit more slowly.
3007 unsigned int *memblock = static_cast<unsigned int *>(block_buffer_);
3008 block->SetPattern(patternlist_->GetRandomPattern());
3009
3010 logprintf(11, "Log: Warning, using pattern fill fallback in "
3011 "DiskThread::WriteBlockToDisk on disk %s (thread %d).\n",
3012 device_name_.c_str(), thread_num_);
3013
3014 for (int i = 0; i < block->GetSize()/wordsize_; i++) {
3015 memblock[i] = block->GetPattern()->pattern(i);
3016 }
3017 } else {
3018 memcpy(block_buffer_, pe.addr, block->GetSize());
3019 block->SetPattern(pe.pattern);
3020 sat_->PutValid(&pe);
3021 }
3022
3023 logprintf(12, "Log: Writing %lld sectors starting at %lld on disk %s"
3024 " (thread %d).\n",
3025 block->GetSize()/kSectorSize, block->GetAddress(),
3026 device_name_.c_str(), thread_num_);
3027
3028 int64 start_time = GetTime();
3029
3030 if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->GetSize(),
3031 block->GetAddress() * kSectorSize, write_timeout_)) {
3032 return false;
3033 }
3034
3035 int64 end_time = GetTime();
3036 logprintf(12, "Log: Writing time: %lld us (thread %d).\n",
3037 end_time - start_time, thread_num_);
3038 if (end_time - start_time > write_threshold_) {
3039 logprintf(5, "Log: Write took %lld us which is longer than threshold "
3040 "%lld us on disk %s (thread %d).\n",
3041 end_time - start_time, write_threshold_, device_name_.c_str(),
3042 thread_num_);
3043 }
3044
3045 return true;
3046}
3047
3048// Verify a block on disk.
3049// Return true if the block was read; also increment the error count
3050// if the block had data errors or performance problems.
3051bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) {
3052 int64 blocks = block->GetSize() / read_block_size_;
3053 int64 bytes_read = 0;
3054 int64 current_blocks;
3055 int64 current_bytes;
3056 uint64 address = block->GetAddress();
3057
3058 logprintf(20, "Log: Reading sectors starting at %lld on disk %s "
3059 "(thread %d).\n",
3060 address, device_name_.c_str(), thread_num_);
3061
3062 // Read block from disk and time the read. If it takes longer than the
3063 // threshold, complain.
3064 if (lseek64(fd, address * kSectorSize, SEEK_SET) == -1) {
3065 logprintf(0, "Process Error: Unable to seek to sector %lld in "
3066 "DiskThread::ValidateSectorsOnDisk on disk %s "
3067 "(thread %d).\n", address, device_name_.c_str(), thread_num_);
3068 return false;
3069 }
3070 int64 start_time = GetTime();
3071
3072 // Split a large write-sized block into small read-sized blocks and
3073 // read them in groups of randomly-sized multiples of read block size.
3074  // This ensures all data written to disk for this particular block
3075  // will be tested using a random reading pattern.
3076 while (blocks != 0) {
3077 // Test all read blocks in a written block.
3078 current_blocks = (random() % blocks) + 1;
3079 current_bytes = current_blocks * read_block_size_;
3080
3081 memset(block_buffer_, 0, current_bytes);
3082
3083 logprintf(20, "Log: Reading %lld sectors starting at sector %lld on "
3084 "disk %s (thread %d)\n",
3085 current_bytes / kSectorSize,
3086 (address * kSectorSize + bytes_read) / kSectorSize,
3087 device_name_.c_str(), thread_num_);
3088
3089 if (!AsyncDiskIO(ASYNC_IO_READ, fd, block_buffer_, current_bytes,
3090 address * kSectorSize + bytes_read,
3091 write_timeout_)) {
3092 return false;
3093 }
3094
3095 int64 end_time = GetTime();
3096 logprintf(20, "Log: Reading time: %lld us (thread %d).\n",
3097 end_time - start_time, thread_num_);
3098 if (end_time - start_time > read_threshold_) {
3099 logprintf(5, "Log: Read took %lld us which is longer than threshold "
3100 "%lld us on disk %s (thread %d).\n",
3101 end_time - start_time, read_threshold_,
3102 device_name_.c_str(), thread_num_);
3103 }
3104
3105 // In non-destructive mode, don't compare the block to the pattern since
3106 // the block was never written to disk in the first place.
3107 if (!non_destructive_) {
3108 if (CheckRegion(block_buffer_, block->GetPattern(), current_bytes,
3109 0, bytes_read)) {
3110 os_->ErrorReport(device_name_.c_str(), "disk-pattern-error", 1);
3111 errorcount_ += 1;
3112 logprintf(0, "Hardware Error: Pattern mismatch in block starting at "
3113 "sector %lld in DiskThread::ValidateSectorsOnDisk on "
3114 "disk %s (thread %d).\n",
3115 address, device_name_.c_str(), thread_num_);
3116 }
3117 }
3118
3119 bytes_read += current_blocks * read_block_size_;
3120 blocks -= current_blocks;
3121 }
3122
3123 return true;
3124}
3125
3126// Direct device access thread.
3127// Return false on software error.
3128bool DiskThread::Work() {
3129 int fd;
3130
3131 logprintf(9, "Log: Starting disk thread %d, disk %s\n",
3132 thread_num_, device_name_.c_str());
3133
3134 srandom(time(NULL));
3135
3136 if (!OpenDevice(&fd)) {
3137 status_ = false;
3138 return false;
3139 }
3140
3141 // Allocate a block buffer aligned to 512 bytes since the kernel requires it
3142  // when using direct IO.
3143#ifdef HAVE_POSIX_MEMALIGN
3144  int memalign_result = posix_memalign(&block_buffer_, kBufferAlignment,
3145                                       sat_->page_length());
3146#else
3147  block_buffer_ = memalign(kBufferAlignment, sat_->page_length());
3148  int memalign_result = (block_buffer_ == 0);
3149#endif
3150  if (memalign_result) {
3151 CloseDevice(fd);
3152 logprintf(0, "Process Error: Unable to allocate memory for buffers "
3153 "for disk %s (thread %d) posix memalign returned %d.\n",
3154 device_name_.c_str(), thread_num_, memalign_result);
3155 status_ = false;
3156 return false;
3157 }
3158
3159#ifdef HAVE_LIBAIO_H
3160  if (io_setup(5, &aio_ctx_)) {
3161 CloseDevice(fd);
3162 logprintf(0, "Process Error: Unable to create aio context for disk %s"
3163 " (thread %d).\n",
3164 device_name_.c_str(), thread_num_);
3165 status_ = false;
3166 return false;
3167 }
3168#endif
3169
3170 bool result = DoWork(fd);
3171
3172 status_ = result;
3173
3174#ifdef HAVE_LIBAIO_H
3175  io_destroy(aio_ctx_);
3176#endif
3177  CloseDevice(fd);
3178
3179 logprintf(9, "Log: Completed %d (disk %s): disk thread status %d, "
3180 "%d pages copied\n",
3181 thread_num_, device_name_.c_str(), status_, pages_copied_);
3182 return result;
3183}
3184
3185RandomDiskThread::RandomDiskThread(DiskBlockTable *block_table)
3186 : DiskThread(block_table) {
3187 update_block_table_ = 0;
3188}
3189
3190RandomDiskThread::~RandomDiskThread() {
3191}
3192
3193// Workload for random disk thread.
3194bool RandomDiskThread::DoWork(int fd) {
3195 logprintf(11, "Log: Random phase for disk %s (thread %d).\n",
3196 device_name_.c_str(), thread_num_);
3197 while (IsReadyToRun()) {
3198 BlockData *block = block_table_->GetRandomBlock();
3199 if (block == NULL) {
3200 logprintf(12, "Log: No block available for device %s (thread %d).\n",
3201 device_name_.c_str(), thread_num_);
3202 } else {
3203 ValidateBlockOnDisk(fd, block);
3204 block_table_->ReleaseBlock(block);
3205 blocks_read_++;
3206 }
3207 }
3208 pages_copied_ = blocks_read_;
3209 return true;
3210}
3211
3212MemoryRegionThread::MemoryRegionThread() {
3213 error_injection_ = false;
3214 pages_ = NULL;
3215}
3216
3217MemoryRegionThread::~MemoryRegionThread() {
3218 if (pages_ != NULL)
3219 delete pages_;
3220}
3221
3222// Set a region of memory or MMIO to be tested.
3223// Return false if region could not be mapped.
3224bool MemoryRegionThread::SetRegion(void *region, int64 size) {
3225 int plength = sat_->page_length();
3226 int npages = size / plength;
3227 if (size % plength) {
3228 logprintf(0, "Process Error: region size is not a multiple of SAT "
3229 "page length\n");
3230 return false;
3231 } else {
3232 if (pages_ != NULL)
3233 delete pages_;
3234 pages_ = new PageEntryQueue(npages);
3235 char *base_addr = reinterpret_cast<char*>(region);
3236 region_ = base_addr;
3237 for (int i = 0; i < npages; i++) {
3238 struct page_entry pe;
3239 init_pe(&pe);
3240 pe.addr = reinterpret_cast<void*>(base_addr + i * plength);
3241 pe.offset = i * plength;
3242
3243 pages_->Push(&pe);
3244 }
3245 return true;
3246 }
3247}
3248
3249// More detailed error printout for hardware errors in memory or MMIO
3250// regions.
3251void MemoryRegionThread::ProcessError(struct ErrorRecord *error,
3252 int priority,
3253 const char *message) {
3254 uint32 buffer_offset;
3255 if (phase_ == kPhaseCopy) {
3256    // If the error occurred in the Copy Phase, it means that
3257    // the source data (i.e., the main memory) is wrong, so
3258    // just pass it to the original ProcessError to report a
3259    // bad-DIMM error.
3260 WorkerThread::ProcessError(error, priority, message);
3261 } else if (phase_ == kPhaseCheck) {
3262    // An error in the Check Phase means that the memory region tested
3263    // has an error. Gather more information and then report
3264    // the error.
3265 // Determine if this is a write or read error.
3266 os_->Flush(error->vaddr);
3267 error->reread = *(error->vaddr);
3268 char *good = reinterpret_cast<char*>(&(error->expected));
3269 char *bad = reinterpret_cast<char*>(&(error->actual));
3270 sat_assert(error->expected != error->actual);
3271 unsigned int offset = 0;
3272 for (offset = 0; offset < (sizeof(error->expected) - 1); offset++) {
3273 if (good[offset] != bad[offset])
3274 break;
3275 }
3276
3277 error->vbyteaddr = reinterpret_cast<char*>(error->vaddr) + offset;
3278
3279 buffer_offset = error->vbyteaddr - region_;
3280
3281 // Find physical address if possible.
3282 error->paddr = os_->VirtualToPhysical(error->vbyteaddr);
3283 logprintf(priority,
3284 "%s: miscompare on %s, CRC check at %p(0x%llx), "
3285 "offset %llx: read:0x%016llx, reread:0x%016llx "
3286 "expected:0x%016llx\n",
3287 message,
3288 identifier_.c_str(),
3289 error->vaddr,
3290 error->paddr,
3291 buffer_offset,
3292 error->actual,
3293 error->reread,
3294 error->expected);
3295 } else {
3296 logprintf(0, "Process Error: memory region thread raised an "
3297 "unexpected error.");
3298 }
3299}
3300
3301// Workload for testing memory or MMIO regions.
3302// Return false on software error.
3303bool MemoryRegionThread::Work() {
3304 struct page_entry source_pe;
3305 struct page_entry memregion_pe;
3306 bool result = true;
3307 int64 loops = 0;
3308 const uint64 error_constant = 0x00ba00000000ba00LL;
3309
3310 // For error injection.
3311 int64 *addr = 0x0;
3312 int offset = 0;
3313 int64 data = 0;
3314
3315 logprintf(9, "Log: Starting Memory Region thread %d\n", thread_num_);
3316
3317 while (IsReadyToRun()) {
3318 // Getting pages from SAT and queue.
3319 phase_ = kPhaseNoPhase;
3320 result = result && sat_->GetValid(&source_pe);
3321 if (!result) {
3322 logprintf(0, "Process Error: memory region thread failed to pop "
3323 "pages from SAT, bailing\n");
3324 break;
3325 }
3326
3327 result = result && pages_->PopRandom(&memregion_pe);
3328 if (!result) {
3329 logprintf(0, "Process Error: memory region thread failed to pop "
3330 "pages from queue, bailing\n");
3331 break;
3332 }
3333
3334 // Error injection for CRC copy.
3335 if ((sat_->error_injection() || error_injection_) && loops == 1) {
3336 addr = reinterpret_cast<int64*>(source_pe.addr);
3337 offset = random() % (sat_->page_length() / wordsize_);
3338 data = addr[offset];
3339 addr[offset] = error_constant;
3340 }
3341
3342 // Copying SAT page into memory region.
3343 phase_ = kPhaseCopy;
3344 CrcCopyPage(&memregion_pe, &source_pe);
3345 memregion_pe.pattern = source_pe.pattern;
3346
3347 // Error injection for CRC Check.
3348 if ((sat_->error_injection() || error_injection_) && loops == 2) {
3349 addr = reinterpret_cast<int64*>(memregion_pe.addr);
3350 offset = random() % (sat_->page_length() / wordsize_);
3351 data = addr[offset];
3352 addr[offset] = error_constant;
3353 }
3354
3355 // Checking page content in memory region.
3356 phase_ = kPhaseCheck;
3357 CrcCheckPage(&memregion_pe);
3358
3359 phase_ = kPhaseNoPhase;
3360 // Storing pages on their proper queues.
3361 result = result && sat_->PutValid(&source_pe);
3362 if (!result) {
3363 logprintf(0, "Process Error: memory region thread failed to push "
3364 "pages into SAT, bailing\n");
3365 break;
3366 }
3367 result = result && pages_->Push(&memregion_pe);
3368 if (!result) {
3369 logprintf(0, "Process Error: memory region thread failed to push "
3370 "pages into queue, bailing\n");
3371 break;
3372 }
3373
3374 if ((sat_->error_injection() || error_injection_) &&
3375 loops >= 1 && loops <= 2) {
3376 addr[offset] = data;
3377 }
3378
3379 loops++;
3380 YieldSelf();
3381 }
3382
3383 pages_copied_ = loops;
3384 status_ = result;
3385 logprintf(9, "Log: Completed %d: Memory Region thread. Status %d, %d "
3386 "pages checked\n", thread_num_, status_, pages_copied_);
3387 return result;
3388}