Fix enough stuff so that it will start up and run for a few basic blocks
(bbs) on amd64 before dying.


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@3230 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/amd64-linux/ldt.c b/coregrind/amd64-linux/ldt.c
index d8bfc08..3c19585 100644
--- a/coregrind/amd64-linux/ldt.c
+++ b/coregrind/amd64-linux/ldt.c
@@ -129,7 +129,6 @@
    if (ldt != NULL)
       VG_(arena_free)(VG_AR_CORE, ldt);
 }
-#endif
 
 
 /* Clear a TLS array. */
@@ -149,7 +148,6 @@
 }
 
 
-#if 0
 /* Fish the base field out of an VgLdtEntry.  This is the only part we
    are particularly interested in. */
 
diff --git a/coregrind/amd64-linux/syscalls.c b/coregrind/amd64-linux/syscalls.c
index 8ad091d..7eb8789 100644
--- a/coregrind/amd64-linux/syscalls.c
+++ b/coregrind/amd64-linux/syscalls.c
@@ -412,9 +412,9 @@
 const UInt VGA_(syscall_table_size) = 
             sizeof(VGA_(syscall_table)) / sizeof(VGA_(syscall_table)[0]);
 
-void        VG_(clear_TLS_for_thread)      ( VgLdtEntry* tls )
-{
-}
+//void        VG_(clear_TLS_for_thread)      ( VgLdtEntry* tls )
+//{
+//}
 
 /*--------------------------------------------------------------------*/
 /*--- end                                                          ---*/
diff --git a/coregrind/amd64/amd64_private.h b/coregrind/amd64/amd64_private.h
index 58e1b6d..be9f4fd 100644
--- a/coregrind/amd64/amd64_private.h
+++ b/coregrind/amd64/amd64_private.h
@@ -36,15 +36,9 @@
 #include "tool_arch.h"           // arch-specific tool stuff
 
 /* ---------------------------------------------------------------------
-   Exports of vg_ldt.c
+   Exports of state.c that are not core-visible
    ------------------------------------------------------------------ */
 
-#if 0
-/* Alloc & copy, and dealloc. */
-extern VgLdtEntry* VG_(allocate_LDT_for_thread)   ( VgLdtEntry* parent_ldt );
-extern void        VG_(deallocate_LDT_for_thread) ( VgLdtEntry* ldt );
-#endif
-extern void        VG_(clear_TLS_for_thread)      ( VgLdtEntry* tls );
 
 #endif   // __AMD64_PRIVATE_H
 
diff --git a/coregrind/amd64/core_arch.h b/coregrind/amd64/core_arch.h
index 6272190..c708449 100644
--- a/coregrind/amd64/core_arch.h
+++ b/coregrind/amd64/core_arch.h
@@ -44,8 +44,6 @@
 #define VG_ELF_MACHINE        EM_X86_64
 #define VG_ELF_CLASS          ELFCLASS64
 
-#define InsnSetArch           InsnSetAMD64
-
 #define VGA_WORD_SIZE         8
 
 /* ---------------------------------------------------------------------
@@ -71,69 +69,23 @@
 #define STACK_FRAME_NEXT(rbp)          (((UWord*)rbp)[0])
 
 // Get stack pointer and frame pointer
-#define ARCH_GET_REAL_STACK_PTR(esp) do {   \
-   I_die_here; \
+#define ARCH_GET_REAL_STACK_PTR(lval) do {   \
+   asm("movq %%rsp, %0" : "=r" (lval));      \
 } while (0)
 
-#define ARCH_GET_REAL_FRAME_PTR(ebp) do {   \
-   I_die_here; \
+#define ARCH_GET_REAL_FRAME_PTR(lval) do {   \
+   asm("movq %%rbp, %0" : "=r" (lval));      \
 } while (0)
 
 /* ---------------------------------------------------------------------
-   LDT type             
-   ------------------------------------------------------------------ */
-
-// XXX: eventually this will be x86-private, not seen by the core(?)
-
-/* This is the hardware-format for a segment descriptor, ie what the
-   x86 actually deals with.  It is 8 bytes long.  It's ugly.  */
-
-typedef struct _LDT_ENTRY {
-    union {
-       struct {
-          UShort      LimitLow;
-          UShort      BaseLow;
-          unsigned    BaseMid         : 8;
-          unsigned    Type            : 5;
-          unsigned    Dpl             : 2;
-          unsigned    Pres            : 1;
-          unsigned    LimitHi         : 4;
-          unsigned    Sys             : 1;
-          unsigned    Reserved_0      : 1;
-          unsigned    Default_Big     : 1;
-          unsigned    Granularity     : 1;
-          unsigned    BaseHi          : 8;
-       } Bits;
-       struct {
-          UInt word1;
-          UInt word2;
-       } Words;
-    } 
-    LdtEnt;
-} VgLdtEntry;
-
-/* ---------------------------------------------------------------------
    Architecture-specific part of a ThreadState
    ------------------------------------------------------------------ */
 
 // Architecture-specific part of a ThreadState
 // XXX: eventually this should be made abstract, ie. the fields not visible
-//      to the core...  then VgLdtEntry can be made non-visible to the core
-//      also.
+//      to the core...
 typedef 
    struct {
-      /* Pointer to this thread's Local (Segment) Descriptor Table.
-         Starts out as NULL, indicating there is no table, and we hope
-         to keep it that way.  If the thread does __NR_modify_ldt to
-         create entries, we allocate a 8192-entry table at that point.
-         This is a straight copy of the Linux kernel's scheme.  Don't
-         forget to deallocate this at thread exit. */
-      VgLdtEntry* ldt;
-
-      /* TLS table. This consists of a small number (currently 3) of
-         entries from the Global Descriptor Table. */
-      VgLdtEntry tls[VKI_GDT_ENTRY_TLS_ENTRIES];
-
       /* --- BEGIN vex-mandated guest state --- */
 
       /* Saved machine context. */
diff --git a/coregrind/amd64/dispatch.S b/coregrind/amd64/dispatch.S
index a0b3be9..8c9ad28 100644
--- a/coregrind/amd64/dispatch.S
+++ b/coregrind/amd64/dispatch.S
@@ -35,13 +35,6 @@
 /*------------------------------------------------------------*/
 /*--- The dispatch loop.                                   ---*/
 /*------------------------------------------------------------*/
-
-#define TT_LOOKUP(reg, fail)                            \
-        movq %rax, reg;                                 \
-        andq $VG_TT_FAST_MASK, reg;                     \
-        movq VG_(tt_fast)(,reg,8), reg;                 \
-        cmpq %rax, (reg);                               \
-        jnz  fail
 	
 .globl VG_(run_innerloop)
 VG_(run_innerloop):
@@ -71,7 +64,24 @@
 	/* fetch %RIP into %rax */
 	movq	VG_(instr_ptr_offset), %rsi
 	movq	(%rbp, %rsi, 1), %rax
+
+	/* set host FPU control word to the default mode expected 
+           by VEX-generated code.  See comments in libvex.h for
+           more info. */
+	finit
+	pushq	$0x027F
+	fldcw	(%rsp)
+	addq	$8, %rsp
 	
+	/* set host SSE control word to the default mode expected 
+	   by VEX-generated code. */
+	pushq	$0x1F80
+	ldmxcsr	(%rsp)
+	addq	$8, %rsp
+
+	/* set dir flag to known value */
+	cld
+
 	/* fall into main loop */
 
 	/* Here, %rax is the only live (real) register.  The entire
@@ -84,16 +94,23 @@
 
 	/* Are we out of timeslice?  If yes, defer to scheduler. */
 	subl $1, VG_(dispatch_ctr)
-	
 	jz	counter_is_zero
-	/* try a fast lookup in the translation cache */
-	TT_LOOKUP(%rbx, fast_lookup_failed)
 
-	/* Found a match.  Call the tce.payload field.  The magic 12
-	   value is offsetof(TCEntry,payload) on a 64-bit platform. */
-	addq	$12, %rbx
-	call 	*%rbx
-	
+	/* try a fast lookup in the translation cache */
+	movq %rax, %rbx
+	andq $VG_TT_FAST_MASK, %rbx
+	movq VG_(tt_fast)(,%rbx,8), %rcx
+	cmpq %rax, (%rcx)
+	jnz  fast_lookup_failed
+	/* increment bb profile counter */
+	movq VG_(tt_fastN)(,%rbx,8), %rdx
+	incl (%rdx)
+
+	/* Found a match.  Call tce[1], which is 8 bytes along, since
+           each tce element is a 64-bit int. */
+	addq 	$8, %rcx
+	call 	*%rcx
+
 	/* 
 	   %rax holds destination (original) address.
 	   %rbp indicates further details of the control transfer
@@ -110,20 +127,34 @@
 
 	jmp	dispatch_exceptional
 
-	
-fast_lookup_failed:
-	/* %RIP is up to date here since dispatch_boring dominates */
-	addl	$1, VG_(dispatch_ctr)
-	movq	$VG_TRC_INNER_FASTMISS, %rax
-	jmp	run_innerloop_exit
 
-counter_is_zero:
-	/* %RIP is up to date here since dispatch_boring dominates */
-	addl	$1, VG_(dispatch_ctr)
-	movq	$VG_TRC_INNER_COUNTERZERO, %rax
-	jmp	run_innerloop_exit
-	
-run_innerloop_exit:
+
+/* All exits from the dispatcher go through here.  %rax holds
+   the return value. 
+*/
+run_innerloop_exit: 
+	/* We're leaving.  Check that nobody messed with
+           %mxcsr or %fpucw.  We can't mess with %rax here as it
+	   holds the tentative return value, but any other register is OK. */
+	pushq	$0
+	fstcw	(%rsp)
+	cmpl	$0x027F, (%rsp)
+	popq	%r11 /* get rid of the word without trashing %eflags */
+	jnz	invariant_violation
+	pushq	$0
+	stmxcsr	(%rsp)
+	andl	$0xFFFFFFC0, (%rsp)  /* mask out status flags */
+	cmpl	$0x1F80, (%rsp)
+	popq	%r11
+	jnz	invariant_violation
+	/* otherwise we're OK */
+	jmp	run_innerloop_exit_REALLY
+
+invariant_violation:
+	movq	$VG_TRC_INVARIANT_FAILED, %rax
+	jmp	run_innerloop_exit_REALLY
+
+run_innerloop_exit_REALLY:
 	popq	%rdi
 	popq	%r15
 	popq	%r14
@@ -140,8 +171,8 @@
 	popq	%rbx
 	ret	
 
-
-
+	
+	
 /* Other ways of getting out of the inner loop.  Placed out-of-line to
    make it look cleaner. 
 */
@@ -150,13 +181,26 @@
 	cmpq	$VG_TRC_INNER_COUNTERZERO, %rbp
 	jz	counter_is_zero
 
-	/* save %rax in %RIP and defer to sched */
+	/* save %rax in %RIP and defer to sched */
 	movq	VG_(instr_ptr_offset), %rsi
 	movq	0(%rsp), %rdi
 	movq	%rax, (%rdi, %rsi, 1)
 	movq	%rbp, %rax
 	jmp	run_innerloop_exit
 
+fast_lookup_failed:
+	/* %RIP is up to date here since dispatch_boring dominates */
+	addl	$1, VG_(dispatch_ctr)
+	movq	$VG_TRC_INNER_FASTMISS, %rax
+	jmp	run_innerloop_exit
+
+counter_is_zero:
+	/* %RIP is up to date here since dispatch_boring dominates */
+	addl	$1, VG_(dispatch_ctr)
+	movq	$VG_TRC_INNER_COUNTERZERO, %rax
+	jmp	run_innerloop_exit
+
+
 /* Let the linker know we don't need an executable stack */
 .section .note.GNU-stack,"",@progbits
 
diff --git a/coregrind/amd64/state.c b/coregrind/amd64/state.c
index 60ee569..3dc1805 100644
--- a/coregrind/amd64/state.c
+++ b/coregrind/amd64/state.c
@@ -98,25 +98,12 @@
 /*--- Thread stuff                                         ---*/
 /*------------------------------------------------------------*/
 
-void VGA_(clear_thread)( ThreadArchState *arch )
-{
-   arch->ldt = NULL;
-   VG_(clear_TLS_for_thread)(arch->tls);
-}  
-
 void VGA_(cleanup_thread) ( ThreadArchState *arch )
 {  
-   I_die_here;
-#if 0
-   /* Deallocate its LDT, if it ever had one. */
-   VG_(deallocate_LDT_for_thread)( arch->ldt ); 
-   arch->ldt = NULL;
-   
-   /* Clear its TLS array. */
-   VG_(clear_TLS_for_thread)( arch->tls );
-#endif
+   /* TODO: deallocate the thread's LDT / GDT ? */
 }  
 
+
 void VGA_(setup_child) ( ThreadArchState *arch, ThreadArchState *parent_arch )
 {  
    I_die_here;
diff --git a/coregrind/arm/core_arch.h b/coregrind/arm/core_arch.h
index bb23f72..2ad94ae 100644
--- a/coregrind/arm/core_arch.h
+++ b/coregrind/arm/core_arch.h
@@ -45,8 +45,6 @@
 #define VG_ELF_MACHINE        EM_ARM
 #define VG_ELF_CLASS          ELFCLASS32
 
-#define InsnSetArch           InsnSetARM
-
 #define VGA_WORD_SIZE         4
 
 /* ---------------------------------------------------------------------
diff --git a/coregrind/vg_main.c b/coregrind/vg_main.c
index 2cc0231..1c9ad5a 100644
--- a/coregrind/vg_main.c
+++ b/coregrind/vg_main.c
@@ -1129,7 +1129,7 @@
 
    /* We know the initial ESP is pointing at argc/argv */
    VG_(client_argc) = *(Int*)cl_esp;
-   VG_(client_argv) = (Char**)(cl_esp + sizeof(Int));
+   VG_(client_argv) = (Char**)(cl_esp + sizeof(HWord));
 
    return cl_esp;
 }
diff --git a/coregrind/vg_proxylwp.c b/coregrind/vg_proxylwp.c
index dd236f3..fd7bbdb 100644
--- a/coregrind/vg_proxylwp.c
+++ b/coregrind/vg_proxylwp.c
@@ -868,10 +868,10 @@
       if (block) {
 	 Int lwp = proxy->lwp;
 
-
+VG_(printf)("OINK 503\n");
 	 if(proxy->lwp != 0)
 	    do_futex(&proxy->lwp, VKI_FUTEX_WAIT, lwp, NULL, NULL);
-
+VG_(printf)("OINK 504\n");
 	 if (status)
 	    *status = proxy->exitcode;
 	 ret = True;
@@ -884,7 +884,7 @@
    } else {
       Int flags = __VKI_WCLONE;
       Int res;
-
+VG_(printf)("OINK 506\n");
       if (!block)
 	 flags |= VKI_WNOHANG;
       res = VG_(waitpid)(proxy->lwp, status, flags);
@@ -961,14 +961,14 @@
    vg_assert(proxy->tid == tid);
    if (proxy->terminating)
       return;		/* already going away */
-
+VG_(printf)("OINK 401\n");
    proxy->terminating = True;
-
+VG_(printf)("OINK 402\n");
    VG_(close)(proxy->topx);
    proxy->topx = -1;
-
+VG_(printf)("OINK 403\n");
    /* proxy thread will close proxy->frommain itself */
-
+VG_(printf)("OINK 404\n");
    if (force && lwp != 0) {
       /* wouldn't need to force it if it were already dead */
       vg_assert(tst->status != VgTs_Empty);
@@ -979,19 +979,21 @@
 
    status = -1;
    res = False;
-
+VG_(printf)("OINK 405\n");
    /* We need to wait for the PX_Exiting message before doing the
       proxy_wait, because if we don't read the results pipe, the proxy
       may be blocked writing to it, causing a deadlock with us as we
       wait for it to exit. */
    sys_wait_results(True, tid, PX_Exiting, True);
+VG_(printf)("OINK 405a\n");
    res = proxy_wait(proxy, True, &status);
-
+VG_(printf)("OINK 406\n");
    if ((!res || status != 0) && VG_(clo_verbosity) > 1)
       VG_(printf)("proxy %d for tid %d exited status %d, res %d\n",
 		  lwp, tid, status, res);
-
+VG_(printf)("OINK 407\n");
    LWP_free(proxy);
+VG_(printf)("OINK 408\n");
    tst->proxy = NULL;
 }
 
diff --git a/coregrind/vg_scheduler.c b/coregrind/vg_scheduler.c
index 89a17ec..1f130b2 100644
--- a/coregrind/vg_scheduler.c
+++ b/coregrind/vg_scheduler.c
@@ -208,6 +208,7 @@
       case VEX_TRC_JMP_SYSCALL:       return "SYSCALL";
       case VEX_TRC_JMP_CLIENTREQ:     return "CLIENTREQ";
       case VEX_TRC_JMP_YIELD:         return "YIELD";
+      case VEX_TRC_JMP_NODECODE:      return "NODECODE";
       case VG_TRC_INNER_COUNTERZERO:  return "COUNTERZERO";
       case VG_TRC_INNER_FASTMISS:     return "FASTMISS";
       case VG_TRC_UNRESUMABLE_SIGNAL: return "FATALSIGNAL";
@@ -317,6 +318,10 @@
    vg_assert(sz_spill == LibVEX_N_SPILL_BYTES);
    vg_assert(a_vex + 2 * sz_vex == a_spill);
 
+   vg_assert(VG_(instr_ptr_offset) >= 0);
+   vg_assert(VG_(instr_ptr_offset) <= 10000); /* let's say */
+   vg_assert(sizeof VG_(instr_ptr_offset) == sizeof(HWord));
+
    VGP_PUSHCC(VgpRun);
 
    /* there should be no undealt-with signals */
@@ -1075,7 +1080,7 @@
          VG_(message)(Vg_DebugMsg, "thread %d:   completed %d bbs, trc %d", 
                                    tid, done_this_time, (Int)trc );
 
-      if (0 && trc != VG_TRC_INNER_FASTMISS)
+      if (1 && trc != VG_TRC_INNER_FASTMISS)
          VG_(message)(Vg_DebugMsg, "thread %d:  %llu bbs, event %s", 
                                    tid, VG_(bbs_done),
                                    name_of_sched_event(trc) );
@@ -1312,23 +1317,24 @@
 void cleanup_after_thread_exited ( ThreadId tid, Bool forcekill )
 {
    Segment *seg;
-
+VG_(printf)("OINK 40\n");
    vg_assert(is_valid_or_empty_tid(tid));
    vg_assert(VG_(threads)[tid].status == VgTs_Empty);
-
+VG_(printf)("OINK 41\n");
    /* Its stack is now off-limits */
    if (VG_(threads)[tid].stack_base) {
       seg = VG_(find_segment)( VG_(threads)[tid].stack_base );
       VG_TRACK( die_mem_stack, seg->addr, seg->len );
    }
-
+VG_(printf)("OINK 42\n");
    VGA_(cleanup_thread)( &VG_(threads)[tid].arch );
-
+VG_(printf)("OINK 43\n");
    /* Not interested in the timeout anymore */
    VG_(threads)[tid].awaken_at = 0xFFFFFFFF;
-
+VG_(printf)("OINK 44\n");
    /* Delete proxy LWP */
    VG_(proxy_delete)(tid, forcekill);
+VG_(printf)("OINK 45\n");
 }
 
 
@@ -1404,21 +1410,27 @@
 void VG_(nuke_all_threads_except) ( ThreadId me )
 {
    ThreadId tid;
+   VG_(printf)("HACK HACK HACK: nuke_all_threads_except\n"); return;
+
    for (tid = 1; tid < VG_N_THREADS; tid++) {
       if (tid == me
           || VG_(threads)[tid].status == VgTs_Empty)
          continue;
-      if (0)
+      if (1)
          VG_(printf)(
             "VG_(nuke_all_threads_except): nuking tid %d\n", tid);
+VG_(printf)("OINK 49\n");
       VG_(proxy_delete)(tid, True);
+VG_(printf)("OINK 49a\n");
       VG_(threads)[tid].status = VgTs_Empty;
       VG_(threads)[tid].associated_mx = NULL;
       VG_(threads)[tid].associated_cv = NULL;
       VG_(threads)[tid].stack_base = (Addr)NULL;
       VG_(threads)[tid].stack_size = 0;
       cleanup_after_thread_exited( tid, True );
+VG_(printf)("OINK 4\n");
    }
+VG_(printf)("OINK 5\n");
 }
 
 
diff --git a/coregrind/x86/dispatch.S b/coregrind/x86/dispatch.S
index 19489cc..f91d117 100644
--- a/coregrind/x86/dispatch.S
+++ b/coregrind/x86/dispatch.S
@@ -89,20 +89,20 @@
 
 	/* Are we out of timeslice?  If yes, defer to scheduler. */
 	subl $1, VG_(dispatch_ctr)
-	
 	jz	counter_is_zero
+
 	/* try a fast lookup in the translation cache */
 	movl %eax, %ebx
 	andl $VG_TT_FAST_MASK, %ebx
 	movl VG_(tt_fast)(,%ebx,4), %ecx
 	cmpl %eax, (%ecx)
 	jnz  fast_lookup_failed
+	/* increment bb profile counter */
 	movl VG_(tt_fastN)(,%ebx,4), %edx
 	incl (%edx)
 
 	/* Found a match.  Call tce[1], which is 8 bytes along, since
            each tce element is a 64-bit int. */
-	
 	addl 	$8, %ecx
 	call 	*%ecx