SSE/SSE2 fixes needed to run the entire test suite of the GNU
Scientific Library (gsl-1.4) compiled with Intel Icc 7.1 20030307Z '-g
-O -xW'.  I think this gives pretty good coverage of SSE/SSE2 floating
point instructions, or at least the subset emitted by Icc.  So far
tested on memcheck and nulgrind; addrcheck and cachesim are still being tested.

MERGE TO STABLE


git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1955 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/addrcheck/ac_main.c b/addrcheck/ac_main.c
index 0ff1b1e..e0f2305 100644
--- a/addrcheck/ac_main.c
+++ b/addrcheck/ac_main.c
@@ -979,15 +979,16 @@
          case NOP:  case LOCK:  case CALLM_E:  case CALLM_S:
             break;
 
-         /* For memory-ref instrs, copy the data_addr into a temporary to be
-          * passed to the helper at the end of the instruction.
+         /* For memory-ref instrs, copy the data_addr into a temporary
+          * to be passed to the helper at the end of the instruction.
           */
          case LOAD:
             switch (u_in->size) {
                case 4:  helper = (Addr)ac_helperc_LOAD4; break;
                case 2:  helper = (Addr)ac_helperc_LOAD2; break;
                case 1:  helper = (Addr)ac_helperc_LOAD1; break;
-               default: VG_(skin_panic)("addrcheck::SK_(instrument):LOAD");
+               default: VG_(skin_panic)
+                           ("addrcheck::SK_(instrument):LOAD");
             }
             uInstr1(cb, CCALL, 0, TempReg, u_in->val1);
             uCCall (cb, helper, 1, 1, False );
@@ -999,7 +1000,8 @@
                case 4:  helper = (Addr)ac_helperc_STORE4; break;
                case 2:  helper = (Addr)ac_helperc_STORE2; break;
                case 1:  helper = (Addr)ac_helperc_STORE1; break;
-               default: VG_(skin_panic)("addrcheck::SK_(instrument):STORE");
+               default: VG_(skin_panic)
+                           ("addrcheck::SK_(instrument):STORE");
             }
             uInstr1(cb, CCALL, 0, TempReg, u_in->val2);
             uCCall (cb, helper, 1, 1, False );
@@ -1046,7 +1048,7 @@
             VG_(copy_UInstr)(cb, u_in);
             break;
 
-         case SSE3a_MemRd: // this one causes trouble
+         case SSE3a_MemRd:
          case SSE2a_MemRd:
             helper = (Addr)ac_fpu_READ_check;
 	    goto do_Access_ARG3;
@@ -1055,7 +1057,8 @@
             helper = (Addr)ac_fpu_WRITE_check;
 	    goto do_Access_ARG3;
          do_Access_ARG3:
-	    sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16);
+	    sk_assert(u_in->size == 4 
+                      || u_in->size == 8 || u_in->size == 16);
             sk_assert(u_in->tag3 == TempReg);
             t_addr = u_in->val3;
             t_size = newTemp(cb);
@@ -1066,10 +1069,8 @@
             VG_(copy_UInstr)(cb, u_in);
             break;
 
-	    //         case SSE2a1_MemRd:
-	    //         case SSE2a1_MemWr:
-	   //         case SSE3a1_MemRd:
-	   //         case SSE3a1_MemWr:
+         case SSE2a1_MemRd:
+         case SSE3a1_MemRd:
 	    VG_(pp_UInstr)(0,u_in);
 	    VG_(skin_panic)("AddrCheck: unhandled SSE uinstr");
 	    break;
@@ -1081,6 +1082,7 @@
          case SSE3g_RegWr:
          case SSE3e_RegRd:
          case SSE4:
+         case SSE3:
          default:
             VG_(copy_UInstr)(cb, u_in);
             break;
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 94fd728..e5406c9 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1583,6 +1583,30 @@
       );
 }
 
+static void emit_SSE3a1 ( FlagSet uses_sflags, 
+                          FlagSet sets_sflags,
+                          UChar first_byte, 
+                          UChar second_byte, 
+ 			  UChar third_byte,
+ 			  UChar fourth_byte,
+ 			  UChar fifth_byte,
+                          Int ireg )
+{  /* Emits bytes 1-3, an amode built from ireg and fourth_byte, then fifth_byte. */
+   VG_(new_emit)(True, uses_sflags, sets_sflags);
+   VG_(emitB) ( first_byte );
+   VG_(emitB) ( second_byte );
+   VG_(emitB) ( third_byte );
+   fourth_byte &= 0x38; /* mask out mod and rm fields, keeping only bits 5:3 */
+   emit_amode_regmem_reg ( ireg, fourth_byte >> 3 );
+   VG_(emitB) ( fifth_byte ); /* last byte goes out after the address mode */
+   if (dis) /* note: fourth_byte is printed in its masked form */
+      VG_(printf)("\n\t\tsse3a1-0x%x:0x%x:0x%x:0x%x:0x%x-(%s)\n", 
+                  (UInt)first_byte, (UInt)second_byte, 
+                  (UInt)third_byte, (UInt)fourth_byte,
+                  (UInt)fifth_byte,
+                  nameIReg(4,ireg) );
+}
+
 static void emit_SSE4 ( FlagSet uses_sflags, 
                         FlagSet sets_sflags,
                         UChar first_byte, 
@@ -4063,6 +4087,25 @@
                        u->val3 );
          break;
 
+      case SSE3a1_MemRd: /* 5-byte insn: val1, val2 and a trailing byte */
+         vg_assert(u->size == 16);
+         vg_assert(u->tag1 == Lit16);
+         vg_assert(u->tag2 == Lit16);
+         vg_assert(u->tag3 == RealReg);
+         vg_assert(!anyFlagUse(u));
+         if (!(*sselive)) {
+            emit_get_sse_state();
+            *sselive = True;
+         }
+         emit_SSE3a1 ( u->flags_r, u->flags_w,
+                      (u->val1 >> 8) & 0xFF,
+                      u->val1 & 0xFF,
+                      (u->val2 >> 8) & 0xFF,
+                      u->val2 & 0xFF,
+                      u->lit32 & 0xFF, /* trailing byte is in lit32[7:0] */
+                      u->val3 );
+         break;
+
       case SSE5:
          vg_assert(u->size == 0);
          vg_assert(u->tag1 == Lit16);
@@ -4103,7 +4146,7 @@
          vg_assert(u->tag1 == Lit16);
          vg_assert(u->tag2 == Lit16);
          vg_assert(u->tag3 == NoValue);
-         vg_assert(!anyFlagUse(u));
+         vg_assert(!readFlagUse(u));
          if (!(*sselive)) {
             emit_get_sse_state();
             *sselive = True;
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index af42790..d14a7e0 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -3453,6 +3453,7 @@
 				UChar opc2,
                                 UChar opc3 )
 {
+   UChar dis_buf[50];
    UChar modrm = getUChar(eip);
    UChar imm8;
    if (epartIsReg(modrm)) {
@@ -3469,7 +3470,21 @@
                      nameXMMReg(gregOfRM(modrm)), (Int)imm8 );
       eip++;
    } else {
-      VG_(core_panic)("dis_SSE3_reg_or_mem_Imm8: mem");
+      UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+      Int  tmpa = LOW24(pair);
+      eip += HI8(pair);
+      imm8 = getUChar(eip);
+      eip++;
+      uInstr3(cb, SSE3a1_MemRd, sz,
+                  Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+                  Lit16, (((UShort)(opc3)) << 8) | ((UShort)modrm),
+                  TempReg, tmpa);
+      uLiteral(cb, imm8);
+      if (dis)
+         VG_(printf)("%s %s, %s, $%d\n", 
+                     name,
+                     dis_buf,
+                     nameXMMReg(gregOfRM(modrm)), (Int)imm8 );
    }
    return eip;
 }
@@ -3850,11 +3865,17 @@
       goto decode_success;
    }
 
-   /* CMPPS -- compare packed floats */
+   /* sz==4: CMPPS -- compare packed floats */
+   /* sz==2: CMPPD -- compare packed doubles */
    if (insn[0] == 0x0F && insn[1] == 0xC2) {
-      vg_assert(sz == 4);
-      eip = dis_SSE2_reg_or_mem_Imm8 ( cb, sorb, eip+2, 16, "cmpps",
-                                       insn[0], insn[1] );
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem_Imm8 ( cb, sorb, eip+2, 16, "cmpps",
+                                          insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+2, 16, "cmppd",
+                                          0x66, insn[0], insn[1] );
+      }
       goto decode_success;
    }
 
@@ -3989,9 +4010,13 @@
    }
 
    /* 0xF2: MINSD */
-   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
+   /* 0xF3: MINSS */
+   if ((insn[0] == 0xF2 || insn[0] == 0xF3) 
+       && insn[1] == 0x0F && insn[2] == 0x5D) {
+      Bool sz8 = insn[0] == 0xF2;
       vg_assert(sz == 4);
-      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "minsd",
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, sz8 ? 8 : 4, 
+                                      sz8 ? "minsd" : "minss",
                                       insn[0], insn[1], insn[2] );
       goto decode_success;
    }
@@ -4412,10 +4437,14 @@
       goto decode_success;
    }
 
-   /* MOVAPS (28,29) -- aligned load/store of xmm reg, or xmm-xmm reg
-      move */
-   /* MOVUPS (10,11) -- unaligned load/store of xmm reg, or xmm-xmm
-      reg move */
+   /* sz==4: MOVAPS (28,29) -- aligned load/store of xmm reg, or
+      xmm-xmm reg move */
+   /* sz==4: MOVUPS (10,11) -- unaligned load/store of xmm reg, or
+      xmm-xmm reg move */
+   /* sz==2: MOVAPD (28,29) -- aligned load/store of xmm reg, or
+      xmm-xmm reg move */
+   /* sz==2: MOVUPD (10,11) -- unaligned load/store of xmm reg, or
+      xmm-xmm reg move */
    if (insn[0] == 0x0F && (insn[1] == 0x28
                            || insn[1] == 0x29
                            || insn[1] == 0x10
@@ -4423,10 +4452,16 @@
       UChar* name = (insn[1] == 0x10 || insn[1] == 0x11)
                     ? "movups" : "movaps";
       Bool store = insn[1] == 0x29 || insn[1] == 11;
-      vg_assert(sz == 4);
-      eip = dis_SSE2_load_store_or_mov
-               ( cb, sorb, eip+2, 16, store, name,
-                     insn[0], insn[1] );
+      vg_assert(sz == 2 || sz == 4);
+      if (sz == 4) {
+         eip = dis_SSE2_load_store_or_mov
+                  ( cb, sorb, eip+2, 16, store, name,
+                        insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_load_store_or_mov
+                  ( cb, sorb, eip+2, 16, store, name,
+                        0x66, insn[0], insn[1] );
+      }
       goto decode_success;
    }
 
@@ -4449,7 +4484,7 @@
       /* Cannot be used for reg-reg moves, according to Intel docs. */
       vg_assert(!epartIsReg(insn[2]));
       eip = dis_SSE3_load_store_or_mov
-               (cb, sorb, eip+2, 16, is_store, "movlpd", 
+               (cb, sorb, eip+2, 8, is_store, "movlpd", 
                     0x66, insn[0], insn[1] );
       goto decode_success;
    }
@@ -4559,6 +4594,124 @@
       goto decode_success;
    }
 
+   /* SQRTSD: square root of scalar double. */
+   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, 
+                                      "sqrtsd",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* MOVLPS -- 8-byte load/store.  How is this different from MOVLPD
+      ? */
+   if (insn[0] == 0x0F 
+       && (insn[1] == 0x12 || insn[1] == 0x13)) {
+      Bool is_store = insn[1]==0x13;
+      vg_assert(sz == 4);
+      /* Cannot be used for reg-reg moves, according to Intel docs. */
+      //      vg_assert(!epartIsReg(insn[2]));
+      eip = dis_SSE2_load_store_or_mov
+               (cb, sorb, eip+2, 8, is_store, "movlps", 
+                    insn[0], insn[1] );
+      goto decode_success;
+   }
+
+   /* 0xF3: RCPSS -- reciprocal of scalar float */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 4, 
+                                      "rcpss",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* MOVMSKPD -- extract 2 sign bits from a xmm reg and copy them to 
+      an ireg.  Top 30 bits of ireg are set to zero. */
+   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x50) {
+      modrm = insn[2];
+      /* Intel docs don't say anything about a memory source being
+	 allowed here. */
+      vg_assert(epartIsReg(modrm));
+      t1 = newTemp(cb);
+      uInstr3(cb, SSE3g_RegWr, 4,
+                  Lit16, (((UShort)0x66) << 8) | (UShort)insn[0],
+                  Lit16, (((UShort)insn[1]) << 8) | (UShort)modrm,
+                  TempReg, t1 );
+      uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+      if (dis)
+         VG_(printf)("movmskpd %s, %s\n", 
+                      nameXMMReg(eregOfRM(modrm)),
+                      nameIReg(4,gregOfRM(modrm)));
+      eip += 3;
+      goto decode_success;
+   }
+
+   /* ANDNPS */
+   /* 0x66: ANDNPD (src)xmmreg-or-mem, (dst)xmmreg */
+   if (insn[0] == 0x0F && insn[1] == 0x55) {
+      vg_assert(sz == 4 || sz == 2);
+      if (sz == 4) {
+         eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "andnps",
+                                         insn[0], insn[1] );
+      } else {
+         eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "andnpd",
+                                         0x66, insn[0], insn[1] );
+      }
+      goto decode_success;
+   }
+
+   /* MOVHPD -- 8-byte load/store. */
+   if (sz == 2 
+       && insn[0] == 0x0F 
+       && (insn[1] == 0x16 || insn[1] == 0x17)) {
+      Bool is_store = insn[1]==0x17;
+      /* Cannot be used for reg-reg moves, according to Intel docs. */
+      vg_assert(!epartIsReg(insn[2]));
+      eip = dis_SSE3_load_store_or_mov
+               (cb, sorb, eip+2, 8, is_store, "movhpd", 
+                    0x66, insn[0], insn[1] );
+      goto decode_success;
+   }
+
+   /* PMOVMSKB -- extract 16 sign bits from a xmm reg and copy them to 
+      an ireg.  Top 16 bits of ireg are set to zero. */
+   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
+      modrm = insn[2];
+      /* Intel docs don't say anything about a memory source being
+	 allowed here. */
+      vg_assert(epartIsReg(modrm));
+      t1 = newTemp(cb);
+      uInstr3(cb, SSE3g_RegWr, 4,
+                  Lit16, (((UShort)0x66) << 8) | (UShort)insn[0],
+                  Lit16, (((UShort)insn[1]) << 8) | (UShort)modrm,
+                  TempReg, t1 );
+      uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+      if (dis)
+         VG_(printf)("pmovmskb %s, %s\n", 
+                      nameXMMReg(eregOfRM(modrm)),
+                      nameIReg(4,gregOfRM(modrm)));
+      eip += 3;
+      goto decode_success;
+   }
+
+   /* CVTDQ2PD -- convert two packed 32-bit ints to two doubles. */
+   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
+      vg_assert(sz == 4);
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "cvtdq2pd",
+                                      insn[0], insn[1], insn[2] );
+      goto decode_success;
+   }
+
+   /* SQRTPD: square root of packed double. */
+   if (sz == 2
+       && insn[0] == 0x0F && insn[1] == 0x51) {
+      eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, 
+                                      "sqrtpd",
+                                      0x66, insn[0], insn[1] );
+      goto decode_success;
+   }
+
    /* Fall through into the non-SSE decoder. */
 
    } /* if (VG_(have_ssestate)) */
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 6113df2..d33e0db 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -405,6 +405,7 @@
 #  define LIT8 (((u->lit32) & 0xFFFFFF00) == 0)
 #  define LIT1 (!(LIT0))
 #  define LITm (u->tag1 == Literal ? True : LIT0 )
+#  define SZ16 (u->size == 16)
 #  define SZ8  (u->size == 8)
 #  define SZ4  (u->size == 4)
 #  define SZ2  (u->size == 2)
@@ -569,10 +570,11 @@
    case SSE3a_MemRd:  return LIT0 && SZsse && CCa  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e_RegRd:  return LIT0 && SZ4   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e_RegWr:  return LIT0 && SZ4   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
+   case SSE3a1_MemRd: return LIT8 && SZ16  && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3g_RegWr:  return LIT0 && SZ4   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3g1_RegWr: return LIT8 && SZ4   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
    case SSE3e1_RegRd: return LIT8 && SZ2   && CC0  && Ls1 && Ls2 && TR3 && XOTHER;
-   case SSE3:         return LIT0 && SZ0   && CC0  && Ls1 && Ls2 && N3  && XOTHER;
+   case SSE3:         return LIT0 && SZ0   && CCa  && Ls1 && Ls2 && N3  && XOTHER;
    case SSE4:         return LIT0 && SZ0   && CCa  && Ls1 && Ls2 && N3  && XOTHER;
    case SSE5:         return LIT0 && SZ0   && CC0  && Ls1 && Ls2 && Ls3 && XOTHER;
    case SSE3ag_MemRd_RegWr:
@@ -591,6 +593,7 @@
 #  undef LIT1
 #  undef LIT8
 #  undef LITm
+#  undef SZ16
 #  undef SZ8
 #  undef SZ4
 #  undef SZ2
@@ -896,6 +899,7 @@
       case SSE3e_RegRd: return "SSE3e_RRd";
       case SSE3e_RegWr: return "SSE3e_RWr";
       case SSE3g_RegWr: return "SSE3g_RWr";
+      case SSE3a1_MemRd: return "SSE3a1_MRd";
       case SSE3g1_RegWr: return "SSE3g1_RWr";
       case SSE3e1_RegRd: return "SSE3e1_RRd";
       case SSE3:        return "SSE3";
@@ -1081,6 +1085,7 @@
 
       case SSE3g1_RegWr:
       case SSE3e1_RegRd:
+      case SSE3a1_MemRd:
          VG_(printf)("0x%x:0x%x:0x%x:0x%x:0x%x",
                      (u->val1 >> 8) & 0xFF, u->val1 & 0xFF, 
                      (u->val2 >> 8) & 0xFF, u->val2 & 0xFF,
@@ -1260,6 +1265,7 @@
       case LEA1: RD(1); WR(2); break;
       case LEA2: RD(1); RD(2); WR(3); break;
 
+      case SSE3a1_MemRd:
       case SSE2a1_MemRd:
       case SSE3e_RegRd:
       case SSE3a_MemWr:
@@ -1432,7 +1438,7 @@
       case MMX2_MemRd: case MMX2_MemWr:
       case MMX2_ERegRd: case MMX2_ERegWr:
       case SSE2a_MemWr: case SSE2a_MemRd: case SSE2a1_MemRd:
-      case SSE3a_MemWr: case SSE3a_MemRd:
+      case SSE3a_MemWr: case SSE3a_MemRd: case SSE3a1_MemRd:
       case SSE3e_RegRd: case SSE3g_RegWr: case SSE3e_RegWr:
       case SSE3g1_RegWr: case SSE3e1_RegRd:
       case SSE4: case SSE3: case SSE5: case SSE3ag_MemRd_RegWr:
diff --git a/include/vg_skin.h b/include/vg_skin.h
index ba16315..081bd6c 100644
--- a/include/vg_skin.h
+++ b/include/vg_skin.h
@@ -658,9 +658,7 @@
          holding the address.  Arg3 holds this Temp/Real Reg.
          Transfer is at stated size.  */
       SSE2a1_MemRd,
-#if 0
-      SSE2a1_MemWr,
-#endif
+
       /* 4 bytes, writes an integer register.  Insns of the form
          bbbbbbbb:bbbbbbbb:bbbbbbbb:11 ireg bbb.
          Held in val1[15:0] and val2[15:0], and ireg is to be replaced
@@ -718,7 +716,7 @@
       /* 5 bytes, no memrefs, no iregdefs, copy exactly to the
          output.  Held in val1[15:0], val2[15:0] and val3[7:0]. */
       SSE5,
-#if 0
+
       /* 5 bytes, reads/writes mem.  Insns of the form
          bbbbbbbb:bbbbbbbb:bbbbbbbb:mod mmxreg r/m:bbbbbbbb
          Held in val1[15:0], val2[15:0], lit32[7:0].
@@ -726,8 +724,7 @@
          to the Temp/RealReg holding the address.  Arg3 holds this
          Temp/Real Reg.  Transfer is always at size 16.  */
       SSE3a1_MemRd,
-      SSE3a1_MemWr,
-#endif
+
       /* ------------------------ */
 
       /* Not strictly needed, but improve address calculation translations. */
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 4d9e0f4..e88f442 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1107,7 +1107,8 @@
          case SSE3a_MemRd:
          case SSE3a_MemWr:
          case SSE2a_MemWr:
-         case SSE2a_MemRd: { 
+         case SSE2a_MemRd:
+         case SSE3a1_MemRd: { 
             Bool is_load;
             Int t_size;
 
@@ -1116,7 +1117,8 @@
 
             t_size = INVALID_TEMPREG;
             is_load = u_in->opcode==SSE2a_MemRd 
-                      || u_in->opcode==SSE3a_MemRd;
+                      || u_in->opcode==SSE3a_MemRd
+                      || u_in->opcode==SSE3a1_MemRd;
 
             sk_assert(u_in->tag3 == TempReg);