Implement a few SSE instructions, enough to run bzip2 when compiled
with the Intel C compiler (7.1.009) with vectorisation for SSE engaged
(-xK).


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1651 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 2236a47..9b8fb84 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1419,6 +1419,86 @@
                   nameIReg(4,ireg) );
 }
 
+static void emit_SSE2a ( FlagSet uses_sflags, 
+                         FlagSet sets_sflags,
+                         UChar first_byte, 
+                         UChar second_byte, 
+			 UChar third_byte,
+                         Int ireg )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   third_byte &= 0x38; /* mask out mod and rm fields */
+   emit_amode_regmem_reg ( ireg, third_byte >> 3 );
+   if (dis)
+      VG_(printf)("\n\t\tsse-0x%x:0x%x:0x%x-(%s)\n", 
+                  (UInt)first_byte, (UInt)second_byte, (UInt)third_byte,
+                  nameIReg(4,ireg) );
+}
+
+static void emit_SSE3a ( FlagSet uses_sflags, 
+                         FlagSet sets_sflags,
+                         UChar first_byte, 
+                         UChar second_byte, 
+			 UChar third_byte,
+			 UChar fourth_byte,
+                         Int ireg )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   VG_(emitB) ( third_byte );
+   fourth_byte &= 0x38; /* mask out mod and rm fields */
+   emit_amode_regmem_reg ( ireg, fourth_byte >> 3 );
+   if (dis)
+      VG_(printf)("\n\t\tsse-0x%x:0x%x:0x%x:0x%x-(%s)\n", 
+                  (UInt)first_byte, (UInt)second_byte, 
+                  (UInt)third_byte, (UInt)fourth_byte,
+                  nameIReg(4,ireg) );
+}
+
+static void emit_SSE3g ( FlagSet uses_sflags, 
+                         FlagSet sets_sflags,
+                         UChar first_byte, 
+                         UChar second_byte, 
+			 UChar third_byte,
+			 UChar fourth_byte,
+                         Int ireg )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   VG_(emitB) ( third_byte );
+   fourth_byte &= 0x38; /* mask out mod and rm fields */
+   fourth_byte |= 0xC0; /* set top two bits: mod = 11b */
+   fourth_byte |= (ireg & 7); /* patch in our ireg */
+   VG_(emitB) ( fourth_byte );
+   if (dis)
+      VG_(printf)("\n\t\tsse-reg-to-xmmreg--0x%x:0x%x:0x%x:0x%x-(%s)\n", 
+                  (UInt)first_byte, (UInt)second_byte, 
+                  (UInt)third_byte, (UInt)fourth_byte,
+                  nameIReg(4,ireg) );
+}
+
+static void emit_SSE4 ( FlagSet uses_sflags, 
+                         FlagSet sets_sflags,
+                         UChar first_byte, 
+                         UChar second_byte, 
+			 UChar third_byte,
+                         UChar fourth_byte )
+{
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   VG_(emitB) ( third_byte );
+   VG_(emitB) ( fourth_byte );
+   if (dis)
+      VG_(printf)("\n\t\tsse-0x%x:0x%x:0x%x:0x%x\n", 
+                  (UInt)first_byte, (UInt)second_byte, 
+                  (UInt)third_byte, (UInt)fourth_byte );
+}
+
 static void emit_MMX2_reg_to_mmxreg ( FlagSet uses_sflags, 
                                       FlagSet sets_sflags,
 			              UChar first_byte, 
@@ -3593,6 +3673,78 @@
                              u->val2 & 0xFF );
          break;
 
+      case SSE2a_MemWr:
+      case SSE2a_MemRd:
+         vg_assert(u->size == 4 || u->size == 16);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE2a ( u->flags_r, u->flags_w,
+                      (u->val1 >> 8) & 0xFF,
+                      u->val1 & 0xFF,
+                      u->val2 & 0xFF,
+                      u->val3 );
+         break;
+
+      case SSE3a_MemWr:
+      case SSE3a_MemRd:
+         vg_assert(u->size == 4 || u->size == 16);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE3a ( u->flags_r, u->flags_w,
+                      (u->val1 >> 8) & 0xFF,
+                      u->val1 & 0xFF,
+                      (u->val2 >> 8) & 0xFF,
+                      u->val2 & 0xFF,
+                      u->val3 );
+         break;
+
+      case SSE3g_RegRd:
+         vg_assert(u->size == 4);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE3g ( u->flags_r, u->flags_w,
+                      (u->val1 >> 8) & 0xFF,
+                      u->val1 & 0xFF,
+                      (u->val2 >> 8) & 0xFF,
+                      u->val2 & 0xFF,
+                      u->val3 );
+         break;
+
+      case SSE4:
+         vg_assert(u->size == 0);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == NoValue);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE4 ( u->flags_r, u->flags_w,
+                     (u->val1 >> 8) & 0xFF,
+                     u->val1 & 0xFF,
+                     (u->val2 >> 8) & 0xFF,
+                     u->val2 & 0xFF );
+         break;
+
       default: 
          if (VG_(needs).extended_UCode) {
 	    if (*sselive) {