Make strace somewhat resilient against process disappearing
under its claws. Prime example is sudden SIGKILL.
Fixes RH#472053
diff --git a/ChangeLog b/ChangeLog
index d6d0c94..e181e68 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,21 @@
 2008-12-17  Denys Vlasenko  <dvlasenk@redhat.com>
 
+	Make strace detect when traced process suddenly disappeared
+	(prime example is randomly arriving SIGKILL).
+	* defs.h (do_ptrace, ptrace_restart): Declare new functions
+	* process.c: Use ptrace_restart instead of bare ptrace.
+	This catches and records ESRCH errors.
+	Print "<unavailable>" if syscall decode or result can't be
+	determined because of an earlier error in ptrace()
+	* syscall.c (trace_syscall): Stop indiscriminately bailing out
+	on errors, print "syscall(????" or even "????(????" but continue.
+	* util.c (do_ptrace, ptrace_restart): Define new functions.
+	(upeek): use do_ptrace in order to catch and record ESRCH.
+	Do not print error message in this case.
+	Fixes RH#472053.
+
+2008-12-17  Denys Vlasenko  <dvlasenk@redhat.com>
+
 	* signal.c (sys_sigaction, sys_rt_sigaction): Fix typo
 	in (sa_handler == SIG_IGN) comparison, it was using SIG_DFL
 	instead.
diff --git a/defs.h b/defs.h
index e349885..f943ea9 100644
--- a/defs.h
+++ b/defs.h
@@ -336,6 +336,7 @@
 	prstatus_t status;	/* procfs status structure */
 #endif
 #endif
+	int ptrace_errno;
 #ifdef FREEBSD
 	struct procfs_status status;
 	int pfd_reg;
@@ -466,6 +467,8 @@
 extern void qualify P((char *));
 extern int get_scno P((struct tcb *));
 extern long known_scno P((struct tcb *));
+extern long do_ptrace P((int request, struct tcb *tcp, void *addr, void *data));
+extern int ptrace_restart P((int request, struct tcb *tcp, int sig));
 extern int trace_syscall P((struct tcb *));
 extern int count_syscall P((struct tcb *, struct timeval *));
 extern void printxval P((const struct xlat *, int, const char *));
diff --git a/process.c b/process.c
index 8653c88..d0ca23b 100644
--- a/process.c
+++ b/process.c
@@ -963,10 +963,8 @@
 				clearbpt(tcpchild);
 
 			tcpchild->flags &= ~(TCB_SUSPENDED|TCB_STARTUP);
-			if (ptrace(PTRACE_SYSCALL, pid, (char *) 1, 0) < 0) {
-				perror("resume: ptrace(PTRACE_SYSCALL, ...)");
+			if (ptrace_restart(PTRACE_SYSCALL, tcpchild, 0) < 0)
 				return -1;
-			}
 
 			if (!qflag)
 				fprintf(stderr, "\
diff --git a/strace.c b/strace.c
index 29eb9cb..13459d8 100644
--- a/strace.c
+++ b/strace.c
@@ -1372,10 +1372,8 @@
 		tcp->parent->nclone_waiting--;
 #endif
 
-	if (ptrace(PTRACE_SYSCALL, tcp->pid, (char *) 1, 0) < 0) {
-		perror("resume: ptrace(PTRACE_SYSCALL, ...)");
+	if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0)
 		return -1;
-	}
 
 	if (!qflag)
 		fprintf(stderr, "Process %u resumed\n", tcp->pid);
@@ -1547,21 +1545,14 @@
 				break;
 			}
 			if (WSTOPSIG(status) == SIGSTOP) {
-				if ((error = ptrace(PTRACE_DETACH,
-				    tcp->pid, (char *) 1, sig)) < 0) {
-					if (errno != ESRCH)
-						perror("detach: ptrace(PTRACE_DETACH, ...)");
-					/* I died trying. */
-				}
+				ptrace_restart(PTRACE_DETACH, tcp, sig);
 				break;
 			}
-			if ((error = ptrace(PTRACE_CONT, tcp->pid, (char *) 1,
-			    WSTOPSIG(status) == SIGTRAP ?
-			    0 : WSTOPSIG(status))) < 0) {
-				if (errno != ESRCH)
-					perror("detach: ptrace(PTRACE_CONT, ...)");
+			error = ptrace_restart(PTRACE_CONT, tcp,
+					WSTOPSIG(status) == SIGTRAP ? 0
+					: WSTOPSIG(status));
+			if (error < 0)
 				break;
-			}
 		}
 #endif /* LINUX */
 
@@ -1570,8 +1561,7 @@
 	if (sig && kill(tcp->pid, sig) < 0)
 		perror("detach: kill");
 	sig = 0;
-	if ((error = ptrace(PTRACE_DETACH, tcp->pid, (char *) 1, sig)) < 0)
-		perror("detach: ptrace(PTRACE_DETACH, ...)");
+	error = ptrace_restart(PTRACE_DETACH, tcp, sig);
 #endif /* SUNOS4 */
 
 #ifndef USE_PROCFS
@@ -2174,17 +2164,16 @@
 			detach(tcp, sig);
 		  	if (leader != NULL && leader != tcp)
 				leader->flags |= TCB_GROUP_EXITING;
-		}
-		else if (ptrace(PTRACE_CONT, tcp->pid, (char *) 1, sig) < 0) {
-			perror("strace: ptrace(PTRACE_CONT, ...)");
-			cleanup();
-			return -1;
-		}
-		else {
-			if (leader != NULL)
+		} else {
+			if (ptrace_restart(PTRACE_CONT, tcp, sig) < 0) {
+				cleanup();
+				return -1;
+			}
+			if (leader != NULL) {
 				leader->flags |= TCB_GROUP_EXITING;
-			if (leader != NULL && leader != tcp)
-				droptcb(tcp);
+				if (leader != tcp)
+					droptcb(tcp);
+			}
 			/* The leader will report to us as parent now,
 			   and then we'll get to the SIG==-1 case.  */
 			return 0;
@@ -2429,9 +2418,7 @@
 				 * Hope we are back in control now.
 				 */
 				tcp->flags &= ~(TCB_INSYSCALL | TCB_SIGTRAPPED);
-				if (ptrace(PTRACE_SYSCALL,
-						pid, (char *) 1, 0) < 0) {
-					perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+				if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0) {
 					cleanup();
 					return -1;
 				}
@@ -2478,9 +2465,7 @@
 #endif
 				continue;
 			}
-			if (ptrace(PTRACE_SYSCALL, pid, (char *) 1,
-				   WSTOPSIG(status)) < 0) {
-				perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+			if (ptrace_restart(PTRACE_SYSCALL, tcp, WSTOPSIG(status)) < 0) {
 				cleanup();
 				return -1;
 			}
@@ -2490,7 +2475,7 @@
 		/* we handled the STATUS, we are permitted to interrupt now. */
 		if (interrupted)
 			return 0;
-		if (trace_syscall(tcp) < 0) {
+		if (trace_syscall(tcp) < 0 && !tcp->ptrace_errno) {
 			if (tcp->flags & TCB_ATTACHED)
 				detach(tcp, 0);
 			else {
@@ -2510,8 +2495,7 @@
 #endif
 			if (tcp->flags & TCB_ATTACHED)
 				detach(tcp, 0);
-			else if (ptrace(PTRACE_CONT, pid, (char *) 1, 0) < 0) {
-				perror("strace: ptrace(PTRACE_CONT, ...)");
+			else if (ptrace_restart(PTRACE_CONT, tcp, 0) < 0) {
 				cleanup();
 				return -1;
 			}
@@ -2523,8 +2507,7 @@
 			continue;
 		}
 	tracing:
-		if (ptrace(PTRACE_SYSCALL, pid, (char *) 1, 0) < 0) {
-			perror("trace: ptrace(PTRACE_SYSCALL, ...)");
+		if (ptrace_restart(PTRACE_SYSCALL, tcp, 0) < 0) {
 			cleanup();
 			return -1;
 		}
@@ -2572,9 +2555,18 @@
 printleader(tcp)
 struct tcb *tcp;
 {
-	if (tcp_last && (!outfname || followfork < 2 || tcp_last == tcp)) {
-		tcp_last->flags |= TCB_REPRINT;
-		tprintf(" <unfinished ...>\n");
+	if (tcp_last) {
+		if (tcp_last->ptrace_errno) {
+			if (tcp_last->flags & TCB_INSYSCALL) {
+				tprintf(" <unavailable>)");
+				tabto(acolumn);
+			}
+			tprintf("= ? <unavailable>\n");
+			tcp_last->ptrace_errno = 0;
+		} else if (!outfname || followfork < 2 || tcp_last == tcp) {
+			tcp_last->flags |= TCB_REPRINT;
+			tprintf(" <unfinished ...>\n");
+		}
 	}
 	curcol = 0;
 	if ((followfork == 1 || pflag_seen > 1) && outfname)
diff --git a/syscall.c b/syscall.c
index c2940c3..7d570d1 100644
--- a/syscall.c
+++ b/syscall.c
@@ -2319,28 +2319,30 @@
 {
 	int sys_res;
 	struct timeval tv;
-	int res;
-
-	/* Measure the exit time as early as possible to avoid errors. */
-	if (dtime && (tcp->flags & TCB_INSYSCALL))
-		gettimeofday(&tv, NULL);
-
-	res = get_scno(tcp);
-	if (res != 1)
-		return res;
-
-	res = syscall_fixup(tcp);
-	if (res != 1)
-		return res;
+	int res, scno_good;
 
 	if (tcp->flags & TCB_INSYSCALL) {
 		long u_error;
-		res = get_error(tcp);
-		if (res != 1)
-			return res;
 
-		internal_syscall(tcp);
-		if (tcp->scno >= 0 && tcp->scno < nsyscalls &&
+		/* Measure the exit time as early as possible to avoid errors. */
+		if (dtime)
+			gettimeofday(&tv, NULL);
+
+		scno_good = res = get_scno(tcp);
+		if (res == 0)
+			return res;
+		if (res == 1)
+			res = syscall_fixup(tcp);
+		if (res == 0)
+			return res;
+		if (res == 1)
+			res = get_error(tcp);
+		if (res == 0)
+			return res;
+		if (res == 1)
+			internal_syscall(tcp);
+
+		if (res == 1 && tcp->scno >= 0 && tcp->scno < nsyscalls &&
 		    !(qual_flags[tcp->scno] & QUAL_TRACE)) {
 			tcp->flags &= ~TCB_INSYSCALL;
 			return 0;
@@ -2349,7 +2351,9 @@
 		if (tcp->flags & TCB_REPRINT) {
 			printleader(tcp);
 			tprintf("<... ");
-			if (tcp->scno >= nsyscalls || tcp->scno < 0)
+			if (scno_good != 1)
+				tprintf("????");
+			else if (tcp->scno >= nsyscalls || tcp->scno < 0)
 				tprintf("syscall_%lu", tcp->scno);
 			else
 				tprintf("%s", sysent[tcp->scno].sys_name);
@@ -2359,6 +2363,13 @@
 		if (cflag)
 			return count_syscall(tcp, &tv);
 
+		if (res != 1) {
+			tprintf(") ");
+			tabto(acolumn);
+			tcp->flags &= ~TCB_INSYSCALL;
+			return res;
+		}
+
 		if (tcp->scno >= nsyscalls || tcp->scno < 0
 		    || (qual_flags[tcp->scno] & QUAL_RAW))
 			sys_res = printargs(tcp);
@@ -2463,9 +2474,35 @@
 	}
 
 	/* Entering system call */
-	res = syscall_enter(tcp);
-	if (res != 1)
+	scno_good = res = get_scno(tcp);
+	if (res == 0)
 		return res;
+	if (res == 1)
+		res = syscall_fixup(tcp);
+	if (res == 0)
+		return res;
+	if (res == 1)
+		res = syscall_enter(tcp);
+	if (res == 0)
+		return res;
+
+	if (res != 1) {
+		printleader(tcp);
+		tcp->flags &= ~TCB_REPRINT;
+		tcp_last = tcp;
+		if (scno_good != 1)
+			tprintf("????" /* anti-trigraph gap */ "(");
+		else if (tcp->scno >= nsyscalls || tcp->scno < 0)
+			tprintf("syscall_%lu(", tcp->scno);
+		else
+			tprintf("%s(", sysent[tcp->scno].sys_name);
+		/*
+		 * " <unavailable>" will be added later by the code which
+		 * detects ptrace errors.
+		 */
+		tcp->flags |= TCB_INSYSCALL;
+		return res;
+	}
 
 	switch (known_scno(tcp)) {
 #ifdef SYS_socket_subcall
diff --git a/util.c b/util.c
index d64dd84..8f97811 100644
--- a/util.c
+++ b/util.c
@@ -241,6 +241,61 @@
 }
 
 /*
+ * Generic ptrace wrapper which tracks ESRCH errors
+ * by setting tcp->ptrace_errno to ESRCH.
+ *
+ * We assume that ESRCH indicates likely process death (SIGKILL?),
+ * modulo bugs where process somehow ended up not stopped.
+ * Unfortunately kernel uses ESRCH for that case too. Oh well.
+ *
+ * Currently used by upeek() only.
+ * TODO: use this in all other ptrace() calls while decoding.
+ */
+long
+do_ptrace(int request, struct tcb *tcp, void *addr, void *data)
+{
+	long l;
+
+	errno = 0;
+	l = ptrace(request, tcp->pid, addr, data);
+	/* Non-ESRCH errors might be our invalid reg/mem accesses,
+	 * we do not record them. */
+	if (errno == ESRCH)
+		tcp->ptrace_errno = ESRCH;
+	return l;
+}
+
+/*
+ * Used when we want to unblock stopped traced process.
+ * Should be only used with PTRACE_CONT, PTRACE_DETACH and PTRACE_SYSCALL.
+ * Returns 0 on success or if error was ESRCH
+ * (presumably process was killed while we talk to it).
+ * Otherwise prints error message and returns -1.
+ */
+int
+ptrace_restart(int op, struct tcb *tcp, int sig)
+{
+	int err;
+	const char *msg;
+
+	errno = 0;
+	ptrace(op, tcp->pid, (void *) 1, (void *) (long) sig);
+	err = errno;
+	if (!err || err == ESRCH)
+		return 0;
+
+	tcp->ptrace_errno = err;
+	msg = "SYSCALL";
+	if (op == PTRACE_CONT)
+		msg = "CONT";
+	if (op == PTRACE_DETACH)
+		msg = "DETACH";
+	fprintf(stderr, "strace: ptrace(PTRACE_%s,1,%d): %s\n",
+			msg, sig, strerror(err));
+	return -1;
+}
+
+/*
  * Print entry in struct xlat table, if there.
  */
 void
@@ -1078,11 +1133,13 @@
 	}
 #endif /* SUNOS4_KERNEL_ARCH_KLUDGE */
 	errno = 0;
-	val = ptrace(PTRACE_PEEKUSER, tcp->pid, (char *) off, 0);
+	val = do_ptrace(PTRACE_PEEKUSER, tcp, (char *) off, 0);
 	if (val == -1 && errno) {
-		char buf[60];
-		sprintf(buf,"upeek: ptrace(PTRACE_PEEKUSER,%d,%lu,0)", tcp->pid, off);
-		perror(buf);
+		if (errno != ESRCH) {
+			char buf[60];
+			sprintf(buf,"upeek: ptrace(PTRACE_PEEKUSER,%d,%lu,0)", tcp->pid, off);
+			perror(buf);
+		}
 		return -1;
 	}
 	*res = val;