#!/bin/sh
#
# Copyright 2015, Daniel Axtens, IBM Corporation
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
| 13 | |
| 14 | |
# Locate the getscom and putscom utilities and record their paths in
# GETSCOM and PUTSCOM.  Executables in the current directory win; otherwise
# both tools must be resolvable through $PATH.  Both are checked, because
# finding only getscom would leave PUTSCOM empty and the failure would not
# show up until the first injection attempt.  If either tool is missing,
# print where to obtain them and exit with status 1.
if [ -x ./getscom ] && [ -x ./putscom ]; then
	GETSCOM=./getscom
	PUTSCOM=./putscom
elif command -v getscom > /dev/null && command -v putscom > /dev/null; then
	# command -v is the POSIX way to resolve a command name to its path.
	GETSCOM=$(command -v getscom)
	PUTSCOM=$(command -v putscom)
else
	cat <<EOF
Can't find getscom/putscom in . or \$PATH.
See https://github.com/open-power/skiboot.
The tool is in external/xscom-utils
EOF
	exit 1
fi
| 30 | |
# Number of HMI events expected for every injection.
# todo: deal with things being offline
expected_hmis=8

# Print the number of kernel log lines (as shown by dmesg) that report a
# harmless Hypervisor Maintenance interrupt.
COUNT_HMIS()
{
	dmesg |
		grep -c -e 'Harmless Hypervisor Maintenance interrupt'
}
| 37 | |
# massively expand snooze delay, allowing injection on all cores
ppc64_cpu --smt-snooze-delay=1000000000

# Set the snooze delay back to 100.
# NOTE(review): 100 is a fixed value, not the delay that was in effect
# before this script ran - confirm that is what is wanted.
restore_snooze_delay() {
	ppc64_cpu --smt-snooze-delay=100
}

# When we exit, restore it.  The signal traps turn HUP/INT/TERM into a
# normal exit so the EXIT trap runs exactly once; not every /bin/sh runs
# the EXIT trap when it is killed by a signal, and a HUP handler that
# merely returned would let the test carry on with the delay already
# restored.  Exit statuses follow the usual 128 + signal number convention.
trap restore_snooze_delay EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM
| 43 | |
# Print the Core FIR SCOM address for the core ID given in $1 (lower-case
# hex, as it appears in the OPAL log).  Leading zeros are dropped first;
# the address has room for a single hex digit, so anything else is rejected
# with status 1 and no output.
core_fir_addr() {
	fir_core=$1
	while [ "${#fir_core}" -gt 1 ] && [ "${fir_core#0}" != "$fir_core" ]; do
		fir_core=${fir_core#0}
	done
	case "$fir_core" in
	[0-9a-f]) echo "0x1${fir_core}013100" ;;
	*) return 1 ;;
	esac
}

# Succeed when $1, a register value printed by getscom, is zero.  Accepts
# "0" as well as zero-padded or 0x-prefixed spellings, so the check does not
# depend on how the value is padded.  An empty string (getscom failed and
# printed nothing on stdout) is not zero.
scom_value_is_zero() {
	case "${1#0x}" in
	'' | *[!0]*) return 1 ;;
	*) return 0 ;;
	esac
}

# Poll COUNT_HMIS until it reports at least $2 HMIs more than the baseline
# count $1, sleeping 5 seconds between polls and giving up after 12 polls
# (one minute).  The exit status reflects the final count rather than the
# number of polls, so HMIs that show up on the very last poll still count
# as success.
wait_for_hmis() {
	hmi_target=$(($1 + $2))
	hmi_polls=0
	hmi_seen=$(COUNT_HMIS)
	while [ "$hmi_seen" -lt "$hmi_target" ] && [ "$hmi_polls" -lt 12 ]; do
		echo "Seen $((hmi_seen - $1)) HMI(s) out of $2 expected, sleeping"
		sleep 5
		hmi_polls=$((hmi_polls + 1))
		hmi_seen=$(COUNT_HMIS)
	done
	[ "$hmi_seen" -ge "$hmi_target" ]
}

# for each chip+core combination
# todo - less fragile parsing
# The awk stage drops repeated log lines (keeping first-seen order) so each
# chip/core pair is injected once.  The loop is the last stage of a
# pipeline and so runs in a subshell: "exit 1" below leaves that subshell,
# and because the pipeline is the final command its status becomes the
# script's exit status.
grep -E -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]+' < /sys/firmware/opal/msglog |
awk '!seen[$0]++' |
while read -r occ_tag chip_tag chip core_tag core; do
	# Fields are: "OCC:" "Chip" <chip> "Core" <core>; only the IDs are used.
	if ! fir=$(core_fir_addr "$core"); then
		echo "Unexpected core ID '$core' for chip $chip in OPAL log. Aborting!"
		exit 1
	fi

	# verify that Core FIR is zero as expected
	if ! scom_value_is_zero "$("$GETSCOM" -c "0x${chip}" "$fir")"; then
		echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
		echo "Result of $GETSCOM -c 0x${chip} $fir:"
		"$GETSCOM" -c "0x${chip}" "$fir"
		echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
		echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
		exit 1
	fi

	# keep track of the number of HMIs handled
	old_hmis=$(COUNT_HMIS)

	# do injection, adding a marker to dmesg for clarity
	echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg
	# inject a RegFile recoverable error
	if ! "$PUTSCOM" -c "0x${chip}" "$fir" 2000000000000000 > /dev/null; then
		echo "Error injecting. Aborting!"
		exit 1
	fi

	# now we want to wait for all the HMIs to be processed
	# we expect one per thread on the core
	if ! wait_for_hmis "$old_hmis" "$expected_hmis"; then
		echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
		exit 1
	fi
	echo "Processed $expected_hmis events; presumed success. Check dmesg."
	echo ""
done