Patch #510695: Add TSC profiling for the VM.
diff --git a/Doc/lib/libsys.tex b/Doc/lib/libsys.tex
index 3b81384..ceced16 100644
--- a/Doc/lib/libsys.tex
+++ b/Doc/lib/libsys.tex
@@ -475,6 +475,16 @@
   debugged.
 \end{funcdesc}
 
+\begin{funcdesc}{settscdump}{on_flag}
+  Activate dumping of VM measurements using the Pentium timestamp
+  counter, if \var{on_flag} is true. Deactivate these dumps if
+  \var{on_flag} is off. The function is available only if Python
+  was compiled with \code{--with-tsc}. To understand the output
+  of this dump, read \code{Python/ceval.c}.
+
+  \versionadded{2.4}
+\end{funcdesc}
+
 \begin{datadesc}{stdin}
 \dataline{stdout}
 \dataline{stderr}
diff --git a/Include/pystate.h b/Include/pystate.h
index 4239108..1ff2626 100644
--- a/Include/pystate.h
+++ b/Include/pystate.h
@@ -29,6 +29,9 @@
 #ifdef HAVE_DLOPEN
     int dlopenflags;
 #endif
+#ifdef WITH_TSC
+    int tscdump;
+#endif
 
 } PyInterpreterState;
 
diff --git a/Misc/NEWS b/Misc/NEWS
index cfd296e..a07d887 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -499,6 +499,9 @@
 Build
 -----
 
+- Profiling the VM using the Pentium TSC is now possible if Python
+  is configured --with-tsc.
+
 - In order to find libraries, setup.py now also looks in /lib64, for use
   on AMD64.
 
diff --git a/Python/ceval.c b/Python/ceval.c
index 2d60014..a52168c 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -17,6 +17,26 @@
 
 #include <ctype.h>
 
+#ifdef WITH_TSC
+#include <asm/msr.h>
+
+typedef unsigned long long uint64;
+
+void dump_tsc(int opcode, int ticked, uint64 inst0, uint64 inst1, 
+	      uint64 loop0, uint64 loop1, uint64 intr0, uint64 intr1)
+{
+	uint64 intr, inst, loop;
+	PyThreadState *tstate = PyThreadState_Get();
+	if (!tstate->interp->tscdump)
+		return;
+	intr = intr1 - intr0;
+	inst = inst1 - inst0 - intr;
+	loop = loop1 - loop0 - intr;
+	fprintf(stderr, "opcode=%03d t=%d inst=%06lld loop=%06lld\n",
+		opcode, ticked, inst, loop);
+}
+#endif
+
 /* Turn this on if your compiler chokes on the big switch: */
 /* #define CASE_TOO_BIG 1 */
 
@@ -30,7 +50,11 @@
 
 /* Forward declarations */
 static PyObject *eval_frame(PyFrameObject *);
+#ifdef WITH_TSC
+static PyObject *call_function(PyObject ***, int, uint64*, uint64*);
+#else
 static PyObject *call_function(PyObject ***, int);
+#endif
 static PyObject *fast_function(PyObject *, PyObject ***, int, int, int);
 static PyObject *do_call(PyObject *, PyObject ***, int, int);
 static PyObject *ext_do_call(PyObject *, PyObject ***, int, int, int);
@@ -485,6 +509,44 @@
 #define GETITEM(v, i) PyTuple_GetItem((v), (i))
 #endif
 
+#ifdef WITH_TSC
+/* Use Pentium timestamp counter to mark certain events:
+   inst0 -- beginning of switch statement for opcode dispatch
+   inst1 -- end of switch statement (may be skipped)
+   loop0 -- the top of the mainloop
+   loop1 -- place where control returns again to top of mainloop 
+            (may be skipped)
+   intr1 -- beginning of long interruption
+   intr2 -- end of long interruption
+
+   Many opcodes call out to helper C functions.  In some cases, the
+   time in those functions should be counted towards the time for the
+   opcode, but not in all cases.  For example, a CALL_FUNCTION opcode
+   calls another Python function; there's no point in charge all the
+   bytecode executed by the called function to the caller.
+
+   It's hard to make a useful judgement statically.  In the presence
+   of operator overloading, it's impossible to tell if a call will
+   execute new Python code or not.
+
+   It's a case-by-case judgement.  I'll use intr1 for the following
+   cases:
+
+   EXEC_STMT
+   IMPORT_STAR
+   IMPORT_FROM
+   CALL_FUNCTION (and friends)
+
+ */
+	uint64 inst0, inst1, loop0, loop1, intr0 = 0, intr1 = 0;
+	int ticked = 0;
+
+	rdtscll(inst0);
+	rdtscll(inst1);
+	rdtscll(loop0);
+	rdtscll(loop1);
+#endif
+
 /* Code access macros */
 
 #define INSTR_OFFSET()	(next_instr - first_instr)
@@ -643,6 +705,23 @@
 	w = NULL;
 
 	for (;;) {
+#ifdef WITH_TSC
+		if (inst1 == 0) {
+			/* Almost surely, the opcode executed a break
+			   or a continue, preventing inst1 from being set
+			   on the way out of the loop.
+			*/
+			rdtscll(inst1);
+			loop1 = inst1;
+		}
+		dump_tsc(opcode, ticked, inst0, inst1, loop0, loop1,
+			 intr0, intr1);
+		ticked = 0;
+		inst1 = 0;
+		intr0 = 0;
+		intr1 = 0;
+		rdtscll(loop0);
+#endif
 		assert(stack_pointer >= f->f_valuestack); /* else underflow */
 		assert(STACK_LEVEL() <= f->f_stacksize);  /* else overflow */
 
@@ -662,6 +741,9 @@
                         }
 			_Py_Ticker = _Py_CheckInterval;
 			tstate->tick_counter++;
+#ifdef WITH_TSC
+			ticked = 1;
+#endif
 			if (things_to_do) {
 				if (Py_MakePendingCalls() < 0) {
 					why = WHY_EXCEPTION;
@@ -752,6 +834,9 @@
 #endif
 
 		/* Main switch on opcode */
+#ifdef WITH_TSC
+		rdtscll(inst0);
+#endif
 
 		switch (opcode) {
 
@@ -1493,7 +1578,13 @@
 			v = SECOND();
 			u = THIRD();
 			STACKADJ(-3);
+#ifdef WITH_TSC
+			rdtscll(intr0);
+#endif
 			err = exec_statement(f, u, v, w);
+#ifdef WITH_TSC
+			rdtscll(intr1);
+#endif
 			Py_DECREF(u);
 			Py_DECREF(v);
 			Py_DECREF(w);
@@ -1855,7 +1946,13 @@
 				x = NULL;
 				break;
 			}
+#ifdef WITH_TSC
+			rdtscll(intr0);
+#endif
 			x = PyEval_CallObject(x, w);
+#ifdef WITH_TSC
+			rdtscll(intr1);
+#endif
 			Py_DECREF(w);
 			SET_TOP(x);
 			if (x != NULL) continue;
@@ -1869,7 +1966,13 @@
 					"no locals found during 'import *'");
 				break;
 			}
+#ifdef WITH_TSC
+			rdtscll(intr0);
+#endif
 			err = import_all_from(x, v);
+#ifdef WITH_TSC
+			rdtscll(intr1);
+#endif
 			PyFrame_LocalsToFast(f, 0);
 			Py_DECREF(v);
 			if (err == 0) continue;
@@ -1878,7 +1981,13 @@
 		case IMPORT_FROM:
 			w = GETITEM(names, oparg);
 			v = TOP();
+#ifdef WITH_TSC
+			rdtscll(intr0);
+#endif
 			x = import_from(v, w);
+#ifdef WITH_TSC
+			rdtscll(intr1);
+#endif
 			PUSH(x);
 			if (x != NULL) continue;
 			break;
@@ -1987,7 +2096,11 @@
 
 		case CALL_FUNCTION:
 			PCALL(PCALL_ALL);
+#ifdef WITH_TSC
+			x = call_function(&stack_pointer, oparg, &intr0, &intr1);
+#else
 			x = call_function(&stack_pointer, oparg);
+#endif
 			PUSH(x);
 			if (x != NULL)
 				continue;
@@ -2022,7 +2135,13 @@
 			    n++;
 		    } else
 			    Py_INCREF(func);
+#ifdef WITH_TSC
+		    rdtscll(intr0);
+#endif
 		    x = ext_do_call(func, &stack_pointer, flags, na, nk);
+#ifdef WITH_TSC
+		    rdtscll(intr1);
+#endif
 		    Py_DECREF(func);
 
 		    while (stack_pointer > pfunc) {
@@ -2134,6 +2253,10 @@
 
 	    on_error:
 
+#ifdef WITH_TSC
+		rdtscll(inst1);
+#endif
+
 		/* Quickly continue if no error occurred */
 
 		if (why == WHY_NOT) {
@@ -2143,9 +2266,15 @@
 				if (PyErr_Occurred())
 					fprintf(stderr,
 						"XXX undetected error\n");
-				else
+				else {
+#endif
+#ifdef WITH_TSC
+					rdtscll(loop1);
 #endif
 					continue; /* Normal, fast path */
+#ifdef CHECKEXC
+				}
+#endif
 			}
 			why = WHY_EXCEPTION;
 			x = Py_None;
@@ -2260,6 +2389,9 @@
 
 		if (why != WHY_NOT)
 			break;
+#ifdef WITH_TSC
+		rdtscll(loop1);
+#endif
 
 	} /* main loop */
 
@@ -3331,7 +3463,11 @@
 
 
 static PyObject *
-call_function(PyObject ***pp_stack, int oparg)
+call_function(PyObject ***pp_stack, int oparg
+#ifdef WITH_TSC
+		, uint64* pintr0, uint64* pintr1
+#endif
+		)
 {
 	int na = oparg & 0xff;
 	int nk = (oparg>>8) & 0xff;
@@ -3374,7 +3510,13 @@
 			PyObject *callargs;
 			callargs = load_args(pp_stack, na);
 			BEGIN_C_TRACE
+#ifdef WITH_TSC
+			rdtscll(*pintr0);
+#endif
 			x = PyCFunction_Call(func, callargs, NULL);
+#ifdef WITH_TSC
+			rdtscll(*pintr1);
+#endif
 			END_C_TRACE
 			Py_XDECREF(callargs);
 		}
@@ -3393,10 +3535,16 @@
 			n++;
 		} else
 			Py_INCREF(func);
+#ifdef WITH_TSC
+		rdtscll(*pintr0);
+#endif
 		if (PyFunction_Check(func))
 			x = fast_function(func, pp_stack, n, na, nk);
 		else
 			x = do_call(func, pp_stack, na, nk);
+#ifdef WITH_TSC
+		rdtscll(*pintr1);
+#endif
 		Py_DECREF(func);
 	}
 
diff --git a/Python/pystate.c b/Python/pystate.c
index b0eb4c4..6a8beb9 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -59,6 +59,9 @@
 		interp->dlopenflags = RTLD_LAZY;
 #endif
 #endif
+#ifdef WITH_TSC
+		interp->tscdump = 0;
+#endif
 
 		HEAD_LOCK();
 		interp->next = interp_head;
diff --git a/Python/sysmodule.c b/Python/sysmodule.c
index 09e411e..55c0425 100644
--- a/Python/sysmodule.c
+++ b/Python/sysmodule.c
@@ -442,6 +442,33 @@
 "getcheckinterval() -> current check interval; see setcheckinterval()."
 );
 
+#ifdef WITH_TSC
+static PyObject *
+sys_settscdump(PyObject *self, PyObject *args)
+{
+	int bool;
+	PyThreadState *tstate = PyThreadState_Get();
+
+	if (!PyArg_ParseTuple(args, "i:settscdump", &bool))
+		return NULL;
+	if (bool)
+		tstate->interp->tscdump = 1;
+	else
+		tstate->interp->tscdump = 0;
+	Py_INCREF(Py_None);
+	return Py_None;
+	
+}
+
+PyDoc_STRVAR(settscdump_doc, 
+"settscdump(bool)\n\
+\n\
+If true, tell the Python interpreter to dump VM measurements to\n\
+stderr.  If false, turn off dump.  The measurements are based on the\n\
+Pentium time-stamp counter."
+); 
+#endif TSC
+
 static PyObject *
 sys_setrecursionlimit(PyObject *self, PyObject *args)
 {
@@ -743,6 +770,9 @@
 	{"setprofile",	sys_setprofile, METH_O, setprofile_doc},
 	{"setrecursionlimit", sys_setrecursionlimit, METH_VARARGS,
 	 setrecursionlimit_doc},
+#ifdef WITH_TSC
+	{"settscdump", sys_settscdump, METH_VARARGS, settscdump_doc},
+#endif
 	{"settrace",	sys_settrace, METH_O, settrace_doc},
 	{"call_tracing", sys_call_tracing, METH_VARARGS, call_tracing_doc},
 	{NULL,		NULL}		/* sentinel */
diff --git a/README b/README
index 9997ea2..f020d3a 100644
--- a/README
+++ b/README
@@ -1026,6 +1026,7 @@
 	in files. Python code can open a file with open(file, 'U') to
 	read it in universal newline mode. THIS OPTION IS UNSUPPORTED.
 
+--with-tsc: Profile using the Pentium timestamping counter (TSC).
 
 Building for multiple architectures (using the VPATH feature)
 -------------------------------------------------------------
diff --git a/configure b/configure
index 5619b2c..697f575 100755
--- a/configure
+++ b/configure
@@ -1,5 +1,5 @@
 #! /bin/sh
-# From configure.in Revision: 1.455 .
+# From configure.in Revision: 1.456 .
 # Guess values for system-dependent variables and create Makefiles.
 # Generated by GNU Autoconf 2.57 for python 2.4.
 #
@@ -867,6 +867,7 @@
                           deprecated; use --with(out)-threads
   --with-pth              use GNU pth threading libraries
   --with(out)-doc-strings disable/enable documentation strings
+  --with(out)-tsc         enable/disable timestamp counter profile
   --with(out)-pymalloc    disable/enable specialized mallocs
   --with-wctype-functions use wctype.h functions
   --with-fpectl           enable SIGFPE catching
@@ -12006,6 +12007,31 @@
 echo "${ECHO_T}$with_doc_strings" >&6
 
 # Check for Python-specific malloc support
+echo "$as_me:$LINENO: checking for --with-tsc" >&5
+echo $ECHO_N "checking for --with-tsc... $ECHO_C" >&6
+
+# Check whether --with-tsc or --without-tsc was given.
+if test "${with_tsc+set}" = set; then
+  withval="$with_tsc"
+
+if test "$withval" != no
+then
+
+cat >>confdefs.h <<\_ACEOF
+#define WITH_TSC 1
+_ACEOF
+
+    echo "$as_me:$LINENO: result: yes" >&5
+echo "${ECHO_T}yes" >&6
+else echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6
+fi
+else
+  echo "$as_me:$LINENO: result: no" >&5
+echo "${ECHO_T}no" >&6
+fi;
+
+# Check for Python-specific malloc support
 echo "$as_me:$LINENO: checking for --with-pymalloc" >&5
 echo $ECHO_N "checking for --with-pymalloc... $ECHO_C" >&6
 
diff --git a/configure.in b/configure.in
index cd84ea9..cff0ed3 100644
--- a/configure.in
+++ b/configure.in
@@ -1929,6 +1929,19 @@
 AC_MSG_RESULT($with_doc_strings)
 
 # Check for Python-specific malloc support
+AC_MSG_CHECKING(for --with-tsc)
+AC_ARG_WITH(tsc,
+[  --with(out)-tsc         enable/disable timestamp counter profile], [
+if test "$withval" != no
+then 
+  AC_DEFINE(WITH_TSC, 1, 
+    [Define to profile with the Pentium timestamp counter]) 
+    AC_MSG_RESULT(yes)
+else AC_MSG_RESULT(no)
+fi],
+[AC_MSG_RESULT(no)])
+
+# Check for Python-specific malloc support
 AC_MSG_CHECKING(for --with-pymalloc)
 AC_ARG_WITH(pymalloc,
             AC_HELP_STRING(--with(out)-pymalloc, disable/enable specialized mallocs))
diff --git a/pyconfig.h.in b/pyconfig.h.in
index fd4a124..8cb0cff 100644
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -780,6 +780,9 @@
 /* Define if you want to compile in rudimentary thread support */
 #undef WITH_THREAD
 
+/* Define to profile with the Pentium timestamp counter */
+#undef WITH_TSC
+
 /* Define to 1 if your processor stores words with the most significant byte
    first (like Motorola and SPARC, unlike Intel and VAX). */
 #undef WORDS_BIGENDIAN