Make strace somewhat resilient against a traced process disappearing
from under its claws. The prime example is a sudden SIGKILL.
Fixes RH#472053.
diff --git a/ChangeLog b/ChangeLog
index d6d0c94..e181e68 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,21 @@
2008-12-17 Denys Vlasenko <dvlasenk@redhat.com>
+ Make strace detect when traced process suddenly disappeared
+ (prime example is randomly arriving SIGKILL).
+ * defs.h (do_ptrace, ptrace_restart): Declare new functions.
+ * process.c, strace.c: Use ptrace_restart instead of bare ptrace.
+ This catches ESRCH errors and no longer treats them as fatal.
+ * strace.c (printleader): Print "<unavailable>" if syscall decode or
+ result can't be determined because of an earlier error in ptrace().
+ * syscall.c (trace_syscall): Stop indiscriminately bailing out
+ on errors, print "syscall(????" or even "????(????" but continue.
+ * util.c (do_ptrace, ptrace_restart): Define new functions.
+ (upeek): Use do_ptrace in order to catch and record ESRCH.
+ Do not print error message in this case.
+ Fixes RH#472053.
+
+2008-12-17 Denys Vlasenko <dvlasenk@redhat.com>
+
* signal.c (sys_sigaction, sys_rt_sigaction): Fix typo
in (sa_handler == SIG_IGN) comparison, it was using SIG_DFL
instead.
diff --git a/defs.h b/defs.h
index e349885..f943ea9 100644
--- a/defs.h
+++ b/defs.h
@@ -336,6 +336,7 @@
prstatus_t status; /* procfs status structure */
#endif
#endif
+ int ptrace_errno;
#ifdef FREEBSD
struct procfs_status status;
int pfd_reg;
@@ -466,6 +467,8 @@
extern void qualify P((char *));
extern int get_scno P((struct tcb *));
extern long known_scno P((struct tcb *));
+extern long do_ptrace P((int request, struct tcb *tcp, void *addr, void *data));
+extern int ptrace_restart P((int request, struct tcb *tcp, int sig));
extern int trace_syscall P((struct tcb *));
extern int count_syscall P((struct tcb *, struct timeval *));
extern void printxval P((const struct xlat *, int, const char *));
diff --git a/process.c b/process.c
index 8653c88..d0ca23b 100644
--- a/process.c
+++ b/process.c
@@ -963,10 +963,8 @@
clearbpt(tcpchild);
tcpchild->flags &= ~(TCB_SUSPENDED|TCB_STARTUP);
- if (ptrace(PTRACE_SYSCALL, pid, (char *) 1, 0) < 0) {
- perror("resume: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcpchild, 0) < 0)
return -1;
- }
if (!qflag)
fprintf(stderr, "\
diff --git a/strace.c b/strace.c
index 29eb9cb..13459d8 100644
--- a/strace.c
+++ b/strace.c
@@ -1372,10 +1372,8 @@
tcp->parent->nclone_waiting--;
#endif
- if (ptrace(PTRACE_SYSCALL, tcp->pid, (char *) 1, 0) < 0) {
- perror("resume: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0)
return -1;
- }
if (!qflag)
fprintf(stderr, "Process %u resumed\n", tcp->pid);
@@ -1547,21 +1545,14 @@
break;
}
if (WSTOPSIG(status) == SIGSTOP) {
- if ((error = ptrace(PTRACE_DETACH,
- tcp->pid, (char *) 1, sig)) < 0) {
- if (errno != ESRCH)
- perror("detach: ptrace(PTRACE_DETACH, ...)");
- /* I died trying. */
- }
+ ptrace_restart(PTRACE_DETACH, tcp, sig);
break;
}
- if ((error = ptrace(PTRACE_CONT, tcp->pid, (char *) 1,
- WSTOPSIG(status) == SIGTRAP ?
- 0 : WSTOPSIG(status))) < 0) {
- if (errno != ESRCH)
- perror("detach: ptrace(PTRACE_CONT, ...)");
+ error = ptrace_restart(PTRACE_CONT, tcp,
+ WSTOPSIG(status) == SIGTRAP ? 0
+ : WSTOPSIG(status));
+ if (error < 0)
break;
- }
}
#endif /* LINUX */
@@ -1570,8 +1561,7 @@
if (sig && kill(tcp->pid, sig) < 0)
perror("detach: kill");
sig = 0;
- if ((error = ptrace(PTRACE_DETACH, tcp->pid, (char *) 1, sig)) < 0)
- perror("detach: ptrace(PTRACE_DETACH, ...)");
+ error = ptrace_restart(PTRACE_DETACH, tcp, sig);
#endif /* SUNOS4 */
#ifndef USE_PROCFS
@@ -2174,17 +2164,16 @@
detach(tcp, sig);
if (leader != NULL && leader != tcp)
leader->flags |= TCB_GROUP_EXITING;
- }
- else if (ptrace(PTRACE_CONT, tcp->pid, (char *) 1, sig) < 0) {
- perror("strace: ptrace(PTRACE_CONT, ...)");
- cleanup();
- return -1;
- }
- else {
- if (leader != NULL)
+ } else {
+ if (ptrace_restart(PTRACE_CONT, tcp, sig) < 0) {
+ cleanup();
+ return -1;
+ }
+ if (leader != NULL) {
leader->flags |= TCB_GROUP_EXITING;
- if (leader != NULL && leader != tcp)
- droptcb(tcp);
+ if (leader != tcp)
+ droptcb(tcp);
+ }
/* The leader will report to us as parent now,
and then we'll get to the SIG==-1 case. */
return 0;
@@ -2429,9 +2418,7 @@
* Hope we are back in control now.
*/
tcp->flags &= ~(TCB_INSYSCALL | TCB_SIGTRAPPED);
- if (ptrace(PTRACE_SYSCALL,
- pid, (char *) 1, 0) < 0) {
- perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0) {
cleanup();
return -1;
}
@@ -2478,9 +2465,7 @@
#endif
continue;
}
- if (ptrace(PTRACE_SYSCALL, pid, (char *) 1,
- WSTOPSIG(status)) < 0) {
- perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcp, WSTOPSIG(status)) < 0) {
cleanup();
return -1;
}
@@ -2490,7 +2475,7 @@
/* we handled the STATUS, we are permitted to interrupt now. */
if (interrupted)
return 0;
- if (trace_syscall(tcp) < 0) {
+ if (trace_syscall(tcp) < 0 && !tcp->ptrace_errno) {
if (tcp->flags & TCB_ATTACHED)
detach(tcp, 0);
else {
@@ -2510,8 +2495,7 @@
#endif
if (tcp->flags & TCB_ATTACHED)
detach(tcp, 0);
- else if (ptrace(PTRACE_CONT, pid, (char *) 1, 0) < 0) {
- perror("strace: ptrace(PTRACE_CONT, ...)");
+ else if (ptrace_restart(PTRACE_CONT, tcp, 0) < 0) {
cleanup();
return -1;
}
@@ -2523,8 +2507,7 @@
continue;
}
tracing:
- if (ptrace(PTRACE_SYSCALL, pid, (char *) 1, 0) < 0) {
- perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+ if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0) {
cleanup();
return -1;
}
@@ -2572,9 +2555,18 @@
printleader(tcp)
struct tcb *tcp;
{
- if (tcp_last && (!outfname || followfork < 2 || tcp_last == tcp)) {
- tcp_last->flags |= TCB_REPRINT;
- tprintf(" <unfinished ...>\n");
+ if (tcp_last) {
+ if (tcp_last->ptrace_errno) {
+ if (tcp_last->flags & TCB_INSYSCALL) {
+ tprintf(" <unavailable>)");
+ tabto(acolumn);
+ }
+ tprintf("= ? <unavailable>\n");
+ tcp_last->ptrace_errno = 0;
+ } else if (!outfname || followfork < 2 || tcp_last == tcp) {
+ tcp_last->flags |= TCB_REPRINT;
+ tprintf(" <unfinished ...>\n");
+ }
}
curcol = 0;
if ((followfork == 1 || pflag_seen > 1) && outfname)
diff --git a/syscall.c b/syscall.c
index c2940c3..7d570d1 100644
--- a/syscall.c
+++ b/syscall.c
@@ -2319,28 +2319,30 @@
{
int sys_res;
struct timeval tv;
- int res;
-
- /* Measure the exit time as early as possible to avoid errors. */
- if (dtime && (tcp->flags & TCB_INSYSCALL))
- gettimeofday(&tv, NULL);
-
- res = get_scno(tcp);
- if (res != 1)
- return res;
-
- res = syscall_fixup(tcp);
- if (res != 1)
- return res;
+ int res, scno_good;
if (tcp->flags & TCB_INSYSCALL) {
long u_error;
- res = get_error(tcp);
- if (res != 1)
- return res;
- internal_syscall(tcp);
- if (tcp->scno >= 0 && tcp->scno < nsyscalls &&
+ /* Measure the exit time as early as possible to avoid errors. */
+ if (dtime)
+ gettimeofday(&tv, NULL);
+
+ scno_good = res = get_scno(tcp);
+ if (res == 0)
+ return res;
+ if (res == 1)
+ res = syscall_fixup(tcp);
+ if (res == 0)
+ return res;
+ if (res == 1)
+ res = get_error(tcp);
+ if (res == 0)
+ return res;
+ if (res == 1)
+ internal_syscall(tcp);
+
+ if (res == 1 && tcp->scno >= 0 && tcp->scno < nsyscalls &&
!(qual_flags[tcp->scno] & QUAL_TRACE)) {
tcp->flags &= ~TCB_INSYSCALL;
return 0;
@@ -2349,7 +2351,9 @@
if (tcp->flags & TCB_REPRINT) {
printleader(tcp);
tprintf("<... ");
- if (tcp->scno >= nsyscalls || tcp->scno < 0)
+ if (scno_good != 1)
+ tprintf("????");
+ else if (tcp->scno >= nsyscalls || tcp->scno < 0)
tprintf("syscall_%lu", tcp->scno);
else
tprintf("%s", sysent[tcp->scno].sys_name);
@@ -2359,6 +2363,13 @@
if (cflag)
return count_syscall(tcp, &tv);
+ if (res != 1) {
+ tprintf(") ");
+ tabto(acolumn);
+ tcp->flags &= ~TCB_INSYSCALL;
+ return res;
+ }
+
if (tcp->scno >= nsyscalls || tcp->scno < 0
|| (qual_flags[tcp->scno] & QUAL_RAW))
sys_res = printargs(tcp);
@@ -2463,9 +2474,35 @@
}
/* Entering system call */
- res = syscall_enter(tcp);
- if (res != 1)
+ scno_good = res = get_scno(tcp);
+ if (res == 0)
return res;
+ if (res == 1)
+ res = syscall_fixup(tcp);
+ if (res == 0)
+ return res;
+ if (res == 1)
+ res = syscall_enter(tcp);
+ if (res == 0)
+ return res;
+
+ if (res != 1) {
+ printleader(tcp);
+ tcp->flags &= ~TCB_REPRINT;
+ tcp_last = tcp;
+ if (scno_good != 1)
+ tprintf("????" /* anti-trigraph gap */ "(");
+ else if (tcp->scno >= nsyscalls || tcp->scno < 0)
+ tprintf("syscall_%lu(", tcp->scno);
+ else
+ tprintf("%s(", sysent[tcp->scno].sys_name);
+ /*
+ * " <unavailable>" will be added later by the code which
+ * detects ptrace errors.
+ */
+ tcp->flags |= TCB_INSYSCALL;
+ return res;
+ }
switch (known_scno(tcp)) {
#ifdef SYS_socket_subcall
diff --git a/util.c b/util.c
index d64dd84..8f97811 100644
--- a/util.c
+++ b/util.c
@@ -241,6 +241,61 @@
}
/*
+ * Generic ptrace wrapper: clears errno, issues the request for tcp->pid,
+ * returns ptrace()'s result and records ESRCH in tcp->ptrace_errno.
+ *
+ * We assume that ESRCH indicates likely process death (SIGKILL?),
+ * modulo bugs where the process somehow ended up not stopped.
+ * Unfortunately the kernel uses ESRCH for that case too. Oh well.
+ *
+ * Currently used by upeek() only.
+ * TODO: use this in all other ptrace() calls while decoding.
+ */
+long
+do_ptrace(int request, struct tcb *tcp, void *addr, void *data)
+{
+ long l;
+
+ errno = 0; /* lets callers tell a -1 result from a real failure */
+ l = ptrace(request, tcp->pid, addr, data);
+ /* Non-ESRCH errors might be our own invalid reg/mem accesses,
+ * so we do not record them. */
+ if (errno == ESRCH)
+ tcp->ptrace_errno = ESRCH;
+ return l;
+}
+
+/*
+ * Used when we want to unblock a stopped traced process.
+ * Should only be used with PTRACE_CONT, PTRACE_DETACH and PTRACE_SYSCALL.
+ * Returns 0 on success or if the error was ESRCH
+ * (presumably the process was killed while we were talking to it).
+ * Otherwise saves errno in tcp->ptrace_errno, prints a message, returns -1.
+ */
+int
+ptrace_restart(int op, struct tcb *tcp, int sig)
+{
+ int err;
+ const char *msg;
+
+ errno = 0; /* a stale errno must not look like a failure below */
+ ptrace(op, tcp->pid, (void *) 1, (void *) (long) sig);
+ err = errno; /* errno alone decides; the return value is ignored */
+ if (!err || err == ESRCH)
+ return 0; /* ESRCH is neither recorded nor reported here */
+
+ tcp->ptrace_errno = err;
+ msg = "SYSCALL"; /* label for any op other than CONT or DETACH */
+ if (op == PTRACE_CONT)
+ msg = "CONT";
+ if (op == PTRACE_DETACH)
+ msg = "DETACH";
+ fprintf(stderr, "strace: ptrace(PTRACE_%s,1,%d): %s\n",
+ msg, sig, strerror(err));
+ return -1;
+}
+
+/*
* Print entry in struct xlat table, if there.
*/
void
@@ -1078,11 +1133,13 @@
}
#endif /* SUNOS4_KERNEL_ARCH_KLUDGE */
errno = 0;
- val = ptrace(PTRACE_PEEKUSER, tcp->pid, (char *) off, 0);
+ val = do_ptrace(PTRACE_PEEKUSER, tcp, (char *) off, 0);
if (val == -1 && errno) {
- char buf[60];
- sprintf(buf,"upeek: ptrace(PTRACE_PEEKUSER,%d,%lu,0)", tcp->pid, off);
- perror(buf);
+ if (errno != ESRCH) {
+ char buf[60];
+ sprintf(buf,"upeek: ptrace(PTRACE_PEEKUSER,%d,%lu,0)", tcp->pid, off);
+ perror(buf);
+ }
return -1;
}
*res = val;