get_scno is an unholy mess, make it less horrible

Currently, get_scno does *much* more than "get syscall no".
It checks for post-execve SIGTRAP. It checks for changes
in personality. It retrieves params on entry and registers on exit.
Worse still, its behavior differs between architectures: for example,
on AVR32 the registers are fetched in get_scno(), while on I386 this
is done in syscall_enter().

Another problem is that get_scno() is called on both syscall entry and
syscall exit, which is pointless: we don't need to determine scno on
syscall exit, because it is already known from the last syscall entry
and stored in tcp->scno! In essence, get_scno() does two completely
different things on syscall entry and on exit; they are just mixed into
one bottle, like shampoo and conditioner.

The following patches will try to improve this situation.

This change duplicates get_scno into two (so far identical) functions,
get_scno_on_sysenter and get_scno_on_sysexit, and calls them on syscall
entry and syscall exit, respectively.

* defs.h: Rename get_scno to get_scno_on_sysenter; declare it only
if USE_PROCFS.
* strace.c (proc_open): Call get_scno_on_sysenter instead of get_scno.
* syscall.c (get_scno): Split into two (so far identical) functions
get_scno_on_sysenter and get_scno_on_sysexit.
(trace_syscall_entering): Call get_scno_on_sysenter instead of get_scno.
(trace_syscall_exiting): Call get_scno_on_sysexit instead of get_scno.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
diff --git a/syscall.c b/syscall.c
index a3b5727..cb4bef9 100644
--- a/syscall.c
+++ b/syscall.c
@@ -744,8 +744,11 @@
  * other: error, trace_syscall() should print error indicator
  *    ("????" etc) and bail out.
  */
+#ifndef USE_PROCFS
+static
+#endif
 int
-get_scno(struct tcb *tcp)
+get_scno_on_sysenter(struct tcb *tcp)
 {
 	long scno = 0;
 
@@ -1352,6 +1355,619 @@
 	return 1;
 }
 
+/* Returns:
+ * 0: "ignore this ptrace stop", bail out of trace_syscall() silently.
+ * 1: ok, continue in trace_syscall().
+ * other: error, trace_syscall() should print error indicator
+ *    ("????" etc) and bail out.
+ */
+static int
+get_scno_on_sysexit(struct tcb *tcp)
+{
+	long scno = 0;
+
+#ifdef LINUX
+# if defined(S390) || defined(S390X)
+	if (tcp->flags & TCB_WAITEXECVE) {
+		/*
+		 * When the execve system call completes successfully, the
+		 * new process still has -ENOSYS (old style) or __NR_execve
+		 * (new style) in gpr2.  We cannot recover the scno again
+		 * by disassembly, because the image that executed the
+		 * syscall is gone now.  Fortunately, we don't want it.  We
+		 * leave the flag set so that syscall_fixup can fake the
+		 * result.
+		 */
+		if (exiting(tcp))
+			return 1;
+		/*
+		 * This is the post-execve SIGTRAP.  We cannot try to read
+		 * the system call here either.
+		 */
+		tcp->flags &= ~TCB_WAITEXECVE;
+		return 0;
+	}
+
+	if (upeek(tcp, PT_GPR2, &syscall_mode) < 0)
+		return -1;
+
+	if (syscall_mode != -ENOSYS) {
+		/*
+		 * Since kernel version 2.5.44 the scno gets passed in gpr2.
+		 */
+		scno = syscall_mode;
+	} else {
+		/*
+		 * Old style of "passing" the scno via the SVC instruction.
+		 */
+
+		long opcode, offset_reg, tmp;
+		void * svc_addr;
+		static const int gpr_offset[16] = {
+				PT_GPR0,  PT_GPR1,  PT_ORIGGPR2, PT_GPR3,
+				PT_GPR4,  PT_GPR5,  PT_GPR6,     PT_GPR7,
+				PT_GPR8,  PT_GPR9,  PT_GPR10,    PT_GPR11,
+				PT_GPR12, PT_GPR13, PT_GPR14,    PT_GPR15
+		};
+
+		if (upeek(tcp, PT_PSWADDR, &pc) < 0)
+			return -1;
+		errno = 0;
+		opcode = ptrace(PTRACE_PEEKTEXT, tcp->pid, (char *)(pc-sizeof(long)), 0);
+		if (errno) {
+			perror("peektext(pc-oneword)");
+			return -1;
+		}
+
+		/*
+		 *  We have to check if the SVC got executed directly or via an
+		 *  EXECUTE instruction. In case of EXECUTE it is necessary to do
+		 *  instruction decoding to derive the system call number.
+		 *  Unfortunately the opcode sizes of EXECUTE and SVC are different,
+		 *  so that this doesn't work if a SVC opcode is part of an EXECUTE
+		 *  opcode. Since there is no way to find out the opcode size this
+		 *  is the best we can do...
+		 */
+
+		if ((opcode & 0xff00) == 0x0a00) {
+			/* SVC opcode */
+			scno = opcode & 0xff;
+		}
+		else {
+			/* SVC got executed by EXECUTE instruction */
+
+			/*
+			 *  Do instruction decoding of EXECUTE. If you really want to
+			 *  understand this, read the Principles of Operations.
+			 */
+			svc_addr = (void *) (opcode & 0xfff);
+
+			tmp = 0;
+			offset_reg = (opcode & 0x000f0000) >> 16;
+			if (offset_reg && (upeek(tcp, gpr_offset[offset_reg], &tmp) < 0))
+				return -1;
+			svc_addr += tmp;
+
+			tmp = 0;
+			offset_reg = (opcode & 0x0000f000) >> 12;
+			if (offset_reg && (upeek(tcp, gpr_offset[offset_reg], &tmp) < 0))
+				return -1;
+			svc_addr += tmp;
+
+			scno = ptrace(PTRACE_PEEKTEXT, tcp->pid, svc_addr, 0);
+			if (errno)
+				return -1;
+#  if defined(S390X)
+			scno >>= 48;
+#  else
+			scno >>= 16;
+#  endif
+			tmp = 0;
+			offset_reg = (opcode & 0x00f00000) >> 20;
+			if (offset_reg && (upeek(tcp, gpr_offset[offset_reg], &tmp) < 0))
+				return -1;
+
+			scno = (scno | tmp) & 0xff;
+		}
+	}
+# elif defined (POWERPC)
+	if (upeek(tcp, sizeof(unsigned long)*PT_R0, &scno) < 0)
+		return -1;
+	if (entering(tcp)) {
+		/* Check if this is the post-execve SIGTRAP. */
+		if (scno == 0 && (tcp->flags & TCB_WAITEXECVE)) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+	}
+
+#  ifdef POWERPC64
+	if (entering(tcp)) {
+		/* TODO: speed up strace by not doing this at every syscall.
+		 * We only need to do it after execve.
+		 */
+		int currpers;
+		long val;
+		int pid = tcp->pid;
+
+		/* Check for 64/32 bit mode. */
+		if (upeek(tcp, sizeof(unsigned long)*PT_MSR, &val) < 0)
+			return -1;
+		/* SF is bit 0 of MSR */
+		if (val < 0)
+			currpers = 0;
+		else
+			currpers = 1;
+		if (currpers != current_personality) {
+			static const char *const names[] = {"64 bit", "32 bit"};
+			set_personality(currpers);
+			fprintf(stderr, "[ Process PID=%d runs in %s mode. ]\n",
+					pid, names[current_personality]);
+		}
+	}
+#  endif
+# elif defined(AVR32)
+	/*
+	 * Read complete register set in one go.
+	 */
+	if (ptrace(PTRACE_GETREGS, tcp->pid, NULL, &regs) < 0)
+		return -1;
+
+	/*
+	 * We only need to grab the syscall number on syscall entry.
+	 */
+	if (entering(tcp)) {
+		scno = regs.r8;
+
+		/* Check if this is the post-execve SIGTRAP. */
+		if (tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+	}
+# elif defined(BFIN)
+	if (upeek(tcp, PT_ORIG_P0, &scno))
+		return -1;
+# elif defined (I386)
+	if (upeek(tcp, 4*ORIG_EAX, &scno) < 0)
+		return -1;
+# elif defined (X86_64)
+	if (upeek(tcp, 8*ORIG_RAX, &scno) < 0)
+		return -1;
+
+	if (entering(tcp)) {
+		/* TODO: speed up strace by not doing this at every syscall.
+		 * We only need to do it after execve.
+		 */
+		int currpers;
+		long val;
+		int pid = tcp->pid;
+
+		/* Check CS register value. On x86-64 linux it is:
+		 *	0x33	for long mode (64 bit)
+		 *	0x23	for compatibility mode (32 bit)
+		 * It takes only one ptrace and thus doesn't need
+		 * to be cached.
+		 */
+		if (upeek(tcp, 8*CS, &val) < 0)
+			return -1;
+		switch (val) {
+			case 0x23: currpers = 1; break;
+			case 0x33: currpers = 0; break;
+			default:
+				fprintf(stderr, "Unknown value CS=0x%02X while "
+					 "detecting personality of process "
+					 "PID=%d\n", (int)val, pid);
+				currpers = current_personality;
+				break;
+		}
+#  if 0
+		/* This version analyzes the opcode of a syscall instruction.
+		 * (int 0x80 on i386 vs. syscall on x86-64)
+		 * It works, but is too complicated.
+		 */
+		unsigned long val, rip, i;
+
+		if (upeek(tcp, 8*RIP, &rip) < 0)
+			perror("upeek(RIP)");
+
+		/* sizeof(syscall) == sizeof(int 0x80) == 2 */
+		rip -= 2;
+		errno = 0;
+
+		call = ptrace(PTRACE_PEEKTEXT, pid, (char *)rip, (char *)0);
+		if (errno)
+			fprintf(stderr, "ptrace_peektext failed: %s\n",
+					strerror(errno));
+		switch (call & 0xffff) {
+			/* x86-64: syscall = 0x0f 0x05 */
+			case 0x050f: currpers = 0; break;
+			/* i386: int 0x80 = 0xcd 0x80 */
+			case 0x80cd: currpers = 1; break;
+			default:
+				currpers = current_personality;
+				fprintf(stderr,
+					"Unknown syscall opcode (0x%04X) while "
+					"detecting personality of process "
+					"PID=%d\n", (int)call, pid);
+				break;
+		}
+#  endif
+		if (currpers != current_personality) {
+			static const char *const names[] = {"64 bit", "32 bit"};
+			set_personality(currpers);
+			fprintf(stderr, "[ Process PID=%d runs in %s mode. ]\n",
+					pid, names[current_personality]);
+		}
+	}
+# elif defined(IA64)
+#	define IA64_PSR_IS	((long)1 << 34)
+	if (upeek(tcp, PT_CR_IPSR, &psr) >= 0)
+		ia32 = (psr & IA64_PSR_IS) != 0;
+	if (entering(tcp)) {
+		if (ia32) {
+			if (upeek(tcp, PT_R1, &scno) < 0)	/* orig eax */
+				return -1;
+		} else {
+			if (upeek(tcp, PT_R15, &scno) < 0)
+				return -1;
+		}
+		/* Check if this is the post-execve SIGTRAP. */
+		if (tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+	} else {
+		/* Syscall exit */
+		if (upeek(tcp, PT_R8, &r8) < 0)
+			return -1;
+		if (upeek(tcp, PT_R10, &r10) < 0)
+			return -1;
+	}
+# elif defined (ARM)
+	/*
+	 * Read complete register set in one go.
+	 */
+	if (ptrace(PTRACE_GETREGS, tcp->pid, NULL, (void *)&regs) == -1)
+		return -1;
+
+	/*
+	 * We only need to grab the syscall number on syscall entry.
+	 */
+	if (regs.ARM_ip == 0) {
+		if (entering(tcp)) {
+			/* Check if this is the post-execve SIGTRAP. */
+			if (tcp->flags & TCB_WAITEXECVE) {
+				tcp->flags &= ~TCB_WAITEXECVE;
+				return 0;
+			}
+		}
+
+		/*
+		 * Note: we deal only with 32-bit CPUs here.
+		 */
+		if (regs.ARM_cpsr & 0x20) {
+			/*
+			 * Get the Thumb-mode system call number
+			 */
+			scno = regs.ARM_r7;
+		} else {
+			/*
+			 * Get the ARM-mode system call number
+			 */
+			errno = 0;
+			scno = ptrace(PTRACE_PEEKTEXT, tcp->pid, (void *)(regs.ARM_pc - 4), NULL);
+			if (errno)
+				return -1;
+
+		/* FIXME: bogus check? it is already done on entering before,
+		 * so we never can see it here?
+		 */
+			if (scno == 0 && (tcp->flags & TCB_WAITEXECVE)) {
+				tcp->flags &= ~TCB_WAITEXECVE;
+				return 0;
+			}
+
+			/* Handle the EABI syscall convention.  We do not
+			   bother converting structures between the two
+			   ABIs, but basic functionality should work even
+			   if strace and the traced program have different
+			   ABIs.  */
+			if (scno == 0xef000000) {
+				scno = regs.ARM_r7;
+			} else {
+				if ((scno & 0x0ff00000) != 0x0f900000) {
+					fprintf(stderr, "syscall: unknown syscall trap 0x%08lx\n",
+						scno);
+					return -1;
+				}
+
+				/*
+				 * Fixup the syscall number
+				 */
+				scno &= 0x000fffff;
+			}
+		}
+		if (scno & 0x0f0000) {
+			/*
+			 * Handle ARM specific syscall
+			 */
+			set_personality(1);
+			scno &= 0x0000ffff;
+		} else
+			set_personality(0);
+
+		if (exiting(tcp)) {
+			fprintf(stderr, "pid %d stray syscall exit\n", tcp->pid);
+			tcp->flags &= ~TCB_INSYSCALL;
+		}
+	} else {
+		if (entering(tcp)) {
+			fprintf(stderr, "pid %d stray syscall entry\n", tcp->pid);
+			tcp->flags |= TCB_INSYSCALL;
+		}
+	}
+# elif defined (M68K)
+	if (upeek(tcp, 4*PT_ORIG_D0, &scno) < 0)
+		return -1;
+# elif defined (LINUX_MIPSN32)
+	unsigned long long regs[38];
+
+	if (ptrace(PTRACE_GETREGS, tcp->pid, NULL, (long) &regs) < 0)
+		return -1;
+	a3 = regs[REG_A3];
+	r2 = regs[REG_V0];
+
+	if (entering(tcp)) {
+		scno = r2;
+
+		/* Check if this is the post-execve SIGTRAP. */
+		if (scno == 0 && tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+
+		if (scno < 0 || scno > nsyscalls) {
+			if (a3 == 0 || a3 == -1) {
+				if (debug)
+					fprintf(stderr, "stray syscall exit: v0 = %ld\n", scno);
+				return 0;
+			}
+		}
+	}
+# elif defined (MIPS)
+	if (upeek(tcp, REG_A3, &a3) < 0)
+		return -1;
+	if (entering(tcp)) {
+		if (upeek(tcp, REG_V0, &scno) < 0)
+			return -1;
+
+		/* Check if this is the post-execve SIGTRAP. */
+		if (scno == 0 && tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+
+		if (scno < 0 || scno > nsyscalls) {
+			if (a3 == 0 || a3 == -1) {
+				if (debug)
+					fprintf(stderr, "stray syscall exit: v0 = %ld\n", scno);
+				return 0;
+			}
+		}
+	} else {
+		if (upeek(tcp, REG_V0, &r2) < 0)
+			return -1;
+	}
+# elif defined (ALPHA)
+	if (upeek(tcp, REG_A3, &a3) < 0)
+		return -1;
+
+	if (entering(tcp)) {
+		if (upeek(tcp, REG_R0, &scno) < 0)
+			return -1;
+
+		/* Check if this is the post-execve SIGTRAP. */
+		if (scno == 0 && tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+
+		/*
+		 * Do some sanity checks to figure out if it's
+		 * really a syscall entry
+		 */
+		if (scno < 0 || scno > nsyscalls) {
+			if (a3 == 0 || a3 == -1) {
+				if (debug)
+					fprintf(stderr, "stray syscall exit: r0 = %ld\n", scno);
+				return 0;
+			}
+		}
+	}
+	else {
+		if (upeek(tcp, REG_R0, &r0) < 0)
+			return -1;
+	}
+# elif defined (SPARC) || defined (SPARC64)
+	/* Everything we need is in the current register set. */
+	if (ptrace(PTRACE_GETREGS, tcp->pid, (char *)&regs, 0) < 0)
+		return -1;
+
+	/* If we are entering, then disassemble the syscall trap. */
+	if (entering(tcp)) {
+		/* Retrieve the syscall trap instruction. */
+		errno = 0;
+#  if defined(SPARC64)
+		trap = ptrace(PTRACE_PEEKTEXT, tcp->pid, (char *)regs.tpc, 0);
+		trap >>= 32;
+#  else
+		trap = ptrace(PTRACE_PEEKTEXT, tcp->pid, (char *)regs.pc, 0);
+#  endif
+		if (errno)
+			return -1;
+
+		/* Disassemble the trap to see what personality to use. */
+		switch (trap) {
+		case 0x91d02010:
+			/* Linux/SPARC syscall trap. */
+			set_personality(0);
+			break;
+		case 0x91d0206d:
+			/* Linux/SPARC64 syscall trap. */
+			set_personality(2);
+			break;
+		case 0x91d02000:
+			/* SunOS syscall trap. (pers 1) */
+			fprintf(stderr, "syscall: SunOS no support\n");
+			return -1;
+		case 0x91d02008:
+			/* Solaris 2.x syscall trap. (per 2) */
+			set_personality(1);
+			break;
+		case 0x91d02009:
+			/* NetBSD/FreeBSD syscall trap. */
+			fprintf(stderr, "syscall: NetBSD/FreeBSD not supported\n");
+			return -1;
+		case 0x91d02027:
+			/* Solaris 2.x gettimeofday */
+			set_personality(1);
+			break;
+		default:
+			/* Check if this is the post-execve SIGTRAP. */
+			if (tcp->flags & TCB_WAITEXECVE) {
+				tcp->flags &= ~TCB_WAITEXECVE;
+				return 0;
+			}
+#  if defined (SPARC64)
+			fprintf(stderr, "syscall: unknown syscall trap %08lx %016lx\n", trap, regs.tpc);
+#  else
+			fprintf(stderr, "syscall: unknown syscall trap %08lx %08lx\n", trap, regs.pc);
+#  endif
+			return -1;
+		}
+
+		/* Extract the system call number from the registers. */
+		if (trap == 0x91d02027)
+			scno = 156;
+		else
+			scno = regs.u_regs[U_REG_G1];
+		if (scno == 0) {
+			scno = regs.u_regs[U_REG_O0];
+			memmove(&regs.u_regs[U_REG_O0], &regs.u_regs[U_REG_O1], 7*sizeof(regs.u_regs[0]));
+		}
+	}
+# elif defined(HPPA)
+	if (upeek(tcp, PT_GR20, &scno) < 0)
+		return -1;
+	if (entering(tcp)) {
+		/* Check if this is the post-execve SIGTRAP. */
+		if (tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+	}
+# elif defined(SH)
+	/*
+	 * In the new syscall ABI, the system call number is in R3.
+	 */
+	if (upeek(tcp, 4*(REG_REG0+3), &scno) < 0)
+		return -1;
+
+	if (scno < 0) {
+		/* Odd as it may seem, a glibc bug has been known to cause
+		   glibc to issue bogus negative syscall numbers.  So for
+		   our purposes, make strace print what it *should* have been */
+		long correct_scno = (scno & 0xff);
+		if (debug)
+			fprintf(stderr,
+				"Detected glibc bug: bogus system call"
+				" number = %ld, correcting to %ld\n",
+				scno,
+				correct_scno);
+		scno = correct_scno;
+	}
+
+	if (entering(tcp)) {
+		/* Check if this is the post-execve SIGTRAP. */
+		if (scno == 0 && tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+	}
+# elif defined(SH64)
+	if (upeek(tcp, REG_SYSCALL, &scno) < 0)
+		return -1;
+	scno &= 0xFFFF;
+
+	if (entering(tcp)) {
+		/* Check if this is the post-execve SIGTRAP. */
+		if (tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+	}
+# elif defined(CRISV10) || defined(CRISV32)
+	if (upeek(tcp, 4*PT_R9, &scno) < 0)
+		return -1;
+# elif defined(TILE)
+	if (upeek(tcp, PTREGS_OFFSET_REG(10), &scno) < 0)
+		return -1;
+
+	if (entering(tcp)) {
+		/* Check if this is the post-execve SIGTRAP. */
+		if (tcp->flags & TCB_WAITEXECVE) {
+			tcp->flags &= ~TCB_WAITEXECVE;
+			return 0;
+		}
+	}
+# elif defined(MICROBLAZE)
+	if (upeek(tcp, 0, &scno) < 0)
+		return -1;
+# endif
+#endif /* LINUX */
+
+#ifdef SUNOS4
+	if (upeek(tcp, uoff(u_arg[7]), &scno) < 0)
+		return -1;
+#elif defined(SH)
+	/* new syscall ABI returns result in R0 */
+	if (upeek(tcp, 4*REG_REG0, (long *)&r0) < 0)
+		return -1;
+#elif defined(SH64)
+	/* ABI defines result returned in r9 */
+	if (upeek(tcp, REG_GENERAL(9), (long *)&r9) < 0)
+		return -1;
+#endif
+
+#ifdef USE_PROCFS
+# ifdef HAVE_PR_SYSCALL
+	scno = tcp->status.PR_SYSCALL;
+# else
+#  ifndef FREEBSD
+	scno = tcp->status.PR_WHAT;
+#  else
+	if (pread(tcp->pfd_reg, &regs, sizeof(regs), 0) < 0) {
+		perror("pread");
+		return -1;
+	}
+	switch (regs.r_eax) {
+	case SYS_syscall:
+	case SYS___syscall:
+		pread(tcp->pfd, &scno, sizeof(scno), regs.r_esp + sizeof(int));
+		break;
+	default:
+		scno = regs.r_eax;
+		break;
+	}
+#  endif /* FREEBSD */
+# endif /* !HAVE_PR_SYSCALL */
+#endif /* USE_PROCFS */
+
+	if (entering(tcp))
+		tcp->scno = scno;
+	return 1;
+}
 
 long
 known_scno(struct tcb *tcp)
@@ -2271,7 +2887,7 @@
 {
 	int res, scno_good;
 
-	scno_good = res = get_scno(tcp);
+	scno_good = res = get_scno_on_sysenter(tcp);
 	if (res == 0)
 		return res;
 	if (res == 1)
@@ -2443,7 +3059,7 @@
 	/* BTW, why we don't just memorize syscall no. on entry
 	 * in tcp->something?
 	 */
-	scno_good = res = get_scno(tcp);
+	scno_good = res = get_scno_on_sysexit(tcp);
 	if (res == 0)
 		return res;
 	if (res == 1)