SSE/SSE2 fixes needed to run the entire test suite of the GNU
Scientific Library (gsl-1.4) compiled with Intel Icc 7.1 20030307Z '-g
-O -xW'. I think this gives pretty good coverage of SSE/SSE2
floating-point instructions, or at least the subset emitted by Icc. So
far this has been tested with memcheck and nulgrind; addrcheck and
cachesim are still being tested.
MERGE TO STABLE
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@1955 a5019735-40e9-0310-863c-91ae7b9d1cf9
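For reference, a minimal standalone smoke test -- not part of this commit,
and only a sketch -- that exercises a few of the newly decoded instructions
(cmppd, sqrtpd, movmskpd, minss, cvtdq2pd, pmovmskb) via SSE/SSE2
intrinsics. The file name and build lines are just suggestions:

/* sse2_smoke.c -- hypothetical helper, not part of this commit.
   Build with something like "icc -O -xW sse2_smoke.c" or
   "gcc -msse2 sse2_smoke.c" and run the result under the tools. */
#include <stdio.h>
#include <xmmintrin.h>   /* SSE intrinsics:  minss                 */
#include <emmintrin.h>   /* SSE2 intrinsics: cmppd, sqrtpd, ...    */

int main ( void )
{
   __m128d a  = _mm_set_pd(4.0, 9.0);
   __m128d b  = _mm_set_pd(2.0, 3.0);
   __m128d lt = _mm_cmplt_pd(b, a);                   /* cmppd, imm8 == 1 */
   double  d[2];
   float   f;

   _mm_storeu_pd(d, _mm_sqrt_pd(a));                  /* sqrtpd   */
   printf("sqrtpd:   %f %f\n", d[0], d[1]);

   printf("movmskpd: %d\n", _mm_movemask_pd(lt));     /* movmskpd */

   _mm_store_ss(&f, _mm_min_ss(_mm_set_ss(3.0f),      /* minss    */
                               _mm_set_ss(0.5f)));
   printf("minss:    %f\n", f);

   _mm_storeu_pd(d, _mm_cvtepi32_pd(_mm_set1_epi32(7)));  /* cvtdq2pd */
   printf("cvtdq2pd: %f %f\n", d[0], d[1]);

   printf("pmovmskb: 0x%x\n",
          _mm_movemask_epi8(_mm_set1_epi8(-1)));      /* pmovmskb */
   return 0;
}

Which operand forms (register vs memory) the compiler actually emits will
vary, so this only roughly exercises the new decode cases.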
diff --git a/addrcheck/ac_main.c b/addrcheck/ac_main.c
index 0ff1b1e..e0f2305 100644
--- a/addrcheck/ac_main.c
+++ b/addrcheck/ac_main.c
@@ -979,15 +979,16 @@
case NOP: case LOCK: case CALLM_E: case CALLM_S:
break;
- /* For memory-ref instrs, copy the data_addr into a temporary to be
- * passed to the helper at the end of the instruction.
+ /* For memory-ref instrs, copy the data_addr into a temporary
+ * to be passed to the helper at the end of the instruction.
*/
case LOAD:
switch (u_in->size) {
case 4: helper = (Addr)ac_helperc_LOAD4; break;
case 2: helper = (Addr)ac_helperc_LOAD2; break;
case 1: helper = (Addr)ac_helperc_LOAD1; break;
- default: VG_(skin_panic)("addrcheck::SK_(instrument):LOAD");
+ default: VG_(skin_panic)
+ ("addrcheck::SK_(instrument):LOAD");
}
uInstr1(cb, CCALL, 0, TempReg, u_in->val1);
uCCall (cb, helper, 1, 1, False );
@@ -999,7 +1000,8 @@
case 4: helper = (Addr)ac_helperc_STORE4; break;
case 2: helper = (Addr)ac_helperc_STORE2; break;
case 1: helper = (Addr)ac_helperc_STORE1; break;
- default: VG_(skin_panic)("addrcheck::SK_(instrument):STORE");
+ default: VG_(skin_panic)
+ ("addrcheck::SK_(instrument):STORE");
}
uInstr1(cb, CCALL, 0, TempReg, u_in->val2);
uCCall (cb, helper, 1, 1, False );
@@ -1046,7 +1048,7 @@
VG_(copy_UInstr)(cb, u_in);
break;
- case SSE3a_MemRd: // this one causes trouble
+ case SSE3a_MemRd:
case SSE2a_MemRd:
helper = (Addr)ac_fpu_READ_check;
goto do_Access_ARG3;
@@ -1055,7 +1057,8 @@
helper = (Addr)ac_fpu_WRITE_check;
goto do_Access_ARG3;
do_Access_ARG3:
- sk_assert(u_in->size == 4 || u_in->size == 8 || u_in->size == 16);
+ sk_assert(u_in->size == 4
+ || u_in->size == 8 || u_in->size == 16);
sk_assert(u_in->tag3 == TempReg);
t_addr = u_in->val3;
t_size = newTemp(cb);
@@ -1066,10 +1069,8 @@
VG_(copy_UInstr)(cb, u_in);
break;
- // case SSE2a1_MemRd:
- // case SSE2a1_MemWr:
- // case SSE3a1_MemRd:
- // case SSE3a1_MemWr:
+ case SSE2a1_MemRd:
+ case SSE3a1_MemRd:
VG_(pp_UInstr)(0,u_in);
VG_(skin_panic)("AddrCheck: unhandled SSE uinstr");
break;
@@ -1081,6 +1082,7 @@
case SSE3g_RegWr:
case SSE3e_RegRd:
case SSE4:
+ case SSE3:
default:
VG_(copy_UInstr)(cb, u_in);
break;
diff --git a/coregrind/vg_from_ucode.c b/coregrind/vg_from_ucode.c
index 94fd728..e5406c9 100644
--- a/coregrind/vg_from_ucode.c
+++ b/coregrind/vg_from_ucode.c
@@ -1583,6 +1583,30 @@
);
}
+static void emit_SSE3a1 ( FlagSet uses_sflags,
+ FlagSet sets_sflags,
+ UChar first_byte,
+ UChar second_byte,
+ UChar third_byte,
+ UChar fourth_byte,
+ UChar fifth_byte,
+ Int ireg )
+{
+ VG_(new_emit)(True, uses_sflags, sets_sflags);
+ VG_(emitB) ( first_byte );
+ VG_(emitB) ( second_byte );
+ VG_(emitB) ( third_byte );
+ fourth_byte &= 0x38; /* mask out mod and rm fields */
+ emit_amode_regmem_reg ( ireg, fourth_byte >> 3 );
+ VG_(emitB) ( fifth_byte );
+ if (dis)
+ VG_(printf)("\n\t\tsse3a1-0x%x:0x%x:0x%x:0x%x:0x%x-(%s)\n",
+ (UInt)first_byte, (UInt)second_byte,
+ (UInt)third_byte, (UInt)fourth_byte,
+ (UInt)fifth_byte,
+ nameIReg(4,ireg) );
+}
+
static void emit_SSE4 ( FlagSet uses_sflags,
FlagSet sets_sflags,
UChar first_byte,
@@ -4063,6 +4087,25 @@
u->val3 );
break;
+ case SSE3a1_MemRd:
+ vg_assert(u->size == 16);
+ vg_assert(u->tag1 == Lit16);
+ vg_assert(u->tag2 == Lit16);
+ vg_assert(u->tag3 == RealReg);
+ vg_assert(!anyFlagUse(u));
+ if (!(*sselive)) {
+ emit_get_sse_state();
+ *sselive = True;
+ }
+ emit_SSE3a1 ( u->flags_r, u->flags_w,
+ (u->val1 >> 8) & 0xFF,
+ u->val1 & 0xFF,
+ (u->val2 >> 8) & 0xFF,
+ u->val2 & 0xFF,
+ (u->lit32 >> 8) & 0xFF,
+ u->val3 );
+ break;
+
case SSE5:
vg_assert(u->size == 0);
vg_assert(u->tag1 == Lit16);
@@ -4103,7 +4146,7 @@
vg_assert(u->tag1 == Lit16);
vg_assert(u->tag2 == Lit16);
vg_assert(u->tag3 == NoValue);
- vg_assert(!anyFlagUse(u));
+ vg_assert(!readFlagUse(u));
if (!(*sselive)) {
emit_get_sse_state();
*sselive = True;
diff --git a/coregrind/vg_to_ucode.c b/coregrind/vg_to_ucode.c
index af42790..d14a7e0 100644
--- a/coregrind/vg_to_ucode.c
+++ b/coregrind/vg_to_ucode.c
@@ -3453,6 +3453,7 @@
UChar opc2,
UChar opc3 )
{
+ UChar dis_buf[50];
UChar modrm = getUChar(eip);
UChar imm8;
if (epartIsReg(modrm)) {
@@ -3469,7 +3470,21 @@
nameXMMReg(gregOfRM(modrm)), (Int)imm8 );
eip++;
} else {
- VG_(core_panic)("dis_SSE3_reg_or_mem_Imm8: mem");
+ UInt pair = disAMode ( cb, sorb, eip, dis?dis_buf:NULL );
+ Int tmpa = LOW24(pair);
+ eip += HI8(pair);
+ imm8 = getUChar(eip);
+ eip++;
+ uInstr3(cb, SSE3a1_MemRd, sz,
+ Lit16, (((UShort)(opc1)) << 8) | ((UShort)opc2),
+ Lit16, (((UShort)(opc3)) << 8) | ((UShort)modrm),
+ TempReg, tmpa);
+ uLiteral(cb, imm8);
+ if (dis)
+ VG_(printf)("%s %s, %s, $%d\n",
+ name,
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)), (Int)imm8 );
}
return eip;
}
@@ -3850,11 +3865,17 @@
goto decode_success;
}
- /* CMPPS -- compare packed floats */
+ /* sz==4: CMPPS -- compare packed floats */
+ /* sz==2: CMPPD -- compare packed doubles */
if (insn[0] == 0x0F && insn[1] == 0xC2) {
- vg_assert(sz == 4);
- eip = dis_SSE2_reg_or_mem_Imm8 ( cb, sorb, eip+2, 16, "cmpps",
- insn[0], insn[1] );
+ vg_assert(sz == 4 || sz == 2);
+ if (sz == 4) {
+ eip = dis_SSE2_reg_or_mem_Imm8 ( cb, sorb, eip+2, 16, "cmpps",
+ insn[0], insn[1] );
+ } else {
+ eip = dis_SSE3_reg_or_mem_Imm8 ( cb, sorb, eip+2, 16, "cmppd",
+ 0x66, insn[0], insn[1] );
+ }
goto decode_success;
}
@@ -3989,9 +4010,13 @@
}
/* 0xF2: MINSD */
- if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
+ /* 0xF3: MINSS */
+ if ((insn[0] == 0xF2 || insn[0] == 0xF3)
+ && insn[1] == 0x0F && insn[2] == 0x5D) {
+ Bool sz8 = insn[0] == 0xF2;
vg_assert(sz == 4);
- eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "minsd",
+ eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, sz8 ? 8 : 4,
+ sz8 ? "minsd" : "minss",
insn[0], insn[1], insn[2] );
goto decode_success;
}
@@ -4412,10 +4437,14 @@
goto decode_success;
}
- /* MOVAPS (28,29) -- aligned load/store of xmm reg, or xmm-xmm reg
- move */
- /* MOVUPS (10,11) -- unaligned load/store of xmm reg, or xmm-xmm
- reg move */
+ /* sz==4: MOVAPS (28,29) -- aligned load/store of xmm reg, or
+ xmm-xmm reg move */
+ /* sz==4: MOVUPS (10,11) -- unaligned load/store of xmm reg, or
+ xmm-xmm reg move */
+ /* sz==2: MOVAPD (28,29) -- aligned load/store of xmm reg, or
+ xmm-xmm reg move */
+ /* sz==2: MOVUPD (10,11) -- unaligned load/store of xmm reg, or
+ xmm-xmm reg move */
if (insn[0] == 0x0F && (insn[1] == 0x28
|| insn[1] == 0x29
|| insn[1] == 0x10
@@ -4423,10 +4452,16 @@
UChar* name = (insn[1] == 0x10 || insn[1] == 0x11)
? "movups" : "movaps";
Bool store = insn[1] == 0x29 || insn[1] == 0x11;
- vg_assert(sz == 4);
- eip = dis_SSE2_load_store_or_mov
- ( cb, sorb, eip+2, 16, store, name,
- insn[0], insn[1] );
+ vg_assert(sz == 2 || sz == 4);
+ if (sz == 4) {
+ eip = dis_SSE2_load_store_or_mov
+ ( cb, sorb, eip+2, 16, store, name,
+ insn[0], insn[1] );
+ } else {
+ eip = dis_SSE3_load_store_or_mov
+ ( cb, sorb, eip+2, 16, store, name,
+ 0x66, insn[0], insn[1] );
+ }
goto decode_success;
}
@@ -4449,7 +4484,7 @@
/* Cannot be used for reg-reg moves, according to Intel docs. */
vg_assert(!epartIsReg(insn[2]));
eip = dis_SSE3_load_store_or_mov
- (cb, sorb, eip+2, 16, is_store, "movlpd",
+ (cb, sorb, eip+2, 8, is_store, "movlpd",
0x66, insn[0], insn[1] );
goto decode_success;
}
@@ -4559,6 +4594,124 @@
goto decode_success;
}
+ /* SQRTSD: square root of scalar double. */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
+ vg_assert(sz == 4);
+ eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8,
+ "sqrtsd",
+ insn[0], insn[1], insn[2] );
+ goto decode_success;
+ }
+
+ /* MOVLPS -- 8-byte load/store. How is this different from MOVLPD
+ ? */
+ if (insn[0] == 0x0F
+ && (insn[1] == 0x12 || insn[1] == 0x13)) {
+ Bool is_store = insn[1]==0x13;
+ vg_assert(sz == 4);
+ /* Cannot be used for reg-reg moves, according to Intel docs. */
+ // vg_assert(!epartIsReg(insn[2]));
+ eip = dis_SSE2_load_store_or_mov
+ (cb, sorb, eip+2, 8, is_store, "movlps",
+ insn[0], insn[1] );
+ goto decode_success;
+ }
+
+ /* 0xF3: RCPSS -- reciprocal of scalar float */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
+ vg_assert(sz == 4);
+ eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 4,
+ "rcpss",
+ insn[0], insn[1], insn[2] );
+ goto decode_success;
+ }
+
+ /* MOVMSKPD -- extract 2 sign bits from an xmm reg and copy them to
+ an ireg. Top 30 bits of ireg are set to zero. */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x50) {
+ modrm = insn[2];
+ /* Intel docs don't say anything about a memory source being
+ allowed here. */
+ vg_assert(epartIsReg(modrm));
+ t1 = newTemp(cb);
+ uInstr3(cb, SSE3g_RegWr, 4,
+ Lit16, (((UShort)0x66) << 8) | (UShort)insn[0],
+ Lit16, (((UShort)insn[1]) << 8) | (UShort)modrm,
+ TempReg, t1 );
+ uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+ if (dis)
+ VG_(printf)("movmskpd %s, %s\n",
+ nameXMMReg(eregOfRM(modrm)),
+ nameIReg(4,gregOfRM(modrm)));
+ eip += 3;
+ goto decode_success;
+ }
+
+ /* ANDNPS */
+ /* 0x66: ANDNPD (src)xmmreg-or-mem, (dst)xmmreg */
+ if (insn[0] == 0x0F && insn[1] == 0x55) {
+ vg_assert(sz == 4 || sz == 2);
+ if (sz == 4) {
+ eip = dis_SSE2_reg_or_mem ( cb, sorb, eip+2, 16, "andnps",
+ insn[0], insn[1] );
+ } else {
+ eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16, "andnpd",
+ 0x66, insn[0], insn[1] );
+ }
+ goto decode_success;
+ }
+
+ /* MOVHPD -- 8-byte load/store. */
+ if (sz == 2
+ && insn[0] == 0x0F
+ && (insn[1] == 0x16 || insn[1] == 0x17)) {
+ Bool is_store = insn[1]==0x17;
+ /* Cannot be used for reg-reg moves, according to Intel docs. */
+ vg_assert(!epartIsReg(insn[2]));
+ eip = dis_SSE3_load_store_or_mov
+ (cb, sorb, eip+2, 8, is_store, "movhpd",
+ 0x66, insn[0], insn[1] );
+ goto decode_success;
+ }
+
+ /* PMOVMSKB -- extract 16 sign bits from an xmm reg and copy them to
+ an ireg. Top 16 bits of ireg are set to zero. */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
+ modrm = insn[2];
+ /* Intel docs don't say anything about a memory source being
+ allowed here. */
+ vg_assert(epartIsReg(modrm));
+ t1 = newTemp(cb);
+ uInstr3(cb, SSE3g_RegWr, 4,
+ Lit16, (((UShort)0x66) << 8) | (UShort)insn[0],
+ Lit16, (((UShort)insn[1]) << 8) | (UShort)modrm,
+ TempReg, t1 );
+ uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, gregOfRM(modrm));
+ if (dis)
+ VG_(printf)("pmovmskb %s, %s\n",
+ nameXMMReg(eregOfRM(modrm)),
+ nameIReg(4,gregOfRM(modrm)));
+ eip += 3;
+ goto decode_success;
+ }
+
+ /* CVTDQ2PD -- convert two packed 32-bit integers to two packed doubles. */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
+ vg_assert(sz == 4);
+ eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+3, 8, "cvtdq2pd",
+ insn[0], insn[1], insn[2] );
+ goto decode_success;
+ }
+
+ /* SQRTPD: square root of packed double. */
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x51) {
+ eip = dis_SSE3_reg_or_mem ( cb, sorb, eip+2, 16,
+ "sqrtpd",
+ 0x66, insn[0], insn[1] );
+ goto decode_success;
+ }
+
/* Fall through into the non-SSE decoder. */
} /* if (VG_(have_ssestate)) */
diff --git a/coregrind/vg_translate.c b/coregrind/vg_translate.c
index 6113df2..d33e0db 100644
--- a/coregrind/vg_translate.c
+++ b/coregrind/vg_translate.c
@@ -405,6 +405,7 @@
# define LIT8 (((u->lit32) & 0xFFFFFF00) == 0)
# define LIT1 (!(LIT0))
# define LITm (u->tag1 == Literal ? True : LIT0 )
+# define SZ16 (u->size == 16)
# define SZ8 (u->size == 8)
# define SZ4 (u->size == 4)
# define SZ2 (u->size == 2)
@@ -569,10 +570,11 @@
case SSE3a_MemRd: return LIT0 && SZsse && CCa && Ls1 && Ls2 && TR3 && XOTHER;
case SSE3e_RegRd: return LIT0 && SZ4 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
case SSE3e_RegWr: return LIT0 && SZ4 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
+ case SSE3a1_MemRd: return LIT8 && SZ16 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
case SSE3g_RegWr: return LIT0 && SZ4 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
case SSE3g1_RegWr: return LIT8 && SZ4 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
case SSE3e1_RegRd: return LIT8 && SZ2 && CC0 && Ls1 && Ls2 && TR3 && XOTHER;
- case SSE3: return LIT0 && SZ0 && CC0 && Ls1 && Ls2 && N3 && XOTHER;
+ case SSE3: return LIT0 && SZ0 && CCa && Ls1 && Ls2 && N3 && XOTHER;
case SSE4: return LIT0 && SZ0 && CCa && Ls1 && Ls2 && N3 && XOTHER;
case SSE5: return LIT0 && SZ0 && CC0 && Ls1 && Ls2 && Ls3 && XOTHER;
case SSE3ag_MemRd_RegWr:
@@ -591,6 +593,7 @@
# undef LIT1
# undef LIT8
# undef LITm
+# undef SZ16
# undef SZ8
# undef SZ4
# undef SZ2
@@ -896,6 +899,7 @@
case SSE3e_RegRd: return "SSE3e_RRd";
case SSE3e_RegWr: return "SSE3e_RWr";
case SSE3g_RegWr: return "SSE3g_RWr";
+ case SSE3a1_MemRd: return "SSE3a1_MRd";
case SSE3g1_RegWr: return "SSE3g1_RWr";
case SSE3e1_RegRd: return "SSE3e1_RRd";
case SSE3: return "SSE3";
@@ -1081,6 +1085,7 @@
case SSE3g1_RegWr:
case SSE3e1_RegRd:
+ case SSE3a1_MemRd:
VG_(printf)("0x%x:0x%x:0x%x:0x%x:0x%x",
(u->val1 >> 8) & 0xFF, u->val1 & 0xFF,
(u->val2 >> 8) & 0xFF, u->val2 & 0xFF,
@@ -1260,6 +1265,7 @@
case LEA1: RD(1); WR(2); break;
case LEA2: RD(1); RD(2); WR(3); break;
+ case SSE3a1_MemRd:
case SSE2a1_MemRd:
case SSE3e_RegRd:
case SSE3a_MemWr:
@@ -1432,7 +1438,7 @@
case MMX2_MemRd: case MMX2_MemWr:
case MMX2_ERegRd: case MMX2_ERegWr:
case SSE2a_MemWr: case SSE2a_MemRd: case SSE2a1_MemRd:
- case SSE3a_MemWr: case SSE3a_MemRd:
+ case SSE3a_MemWr: case SSE3a_MemRd: case SSE3a1_MemRd:
case SSE3e_RegRd: case SSE3g_RegWr: case SSE3e_RegWr:
case SSE3g1_RegWr: case SSE3e1_RegRd:
case SSE4: case SSE3: case SSE5: case SSE3ag_MemRd_RegWr:
diff --git a/include/vg_skin.h b/include/vg_skin.h
index ba16315..081bd6c 100644
--- a/include/vg_skin.h
+++ b/include/vg_skin.h
@@ -658,9 +658,7 @@
holding the address. Arg3 holds this Temp/Real Reg.
Transfer is at stated size. */
SSE2a1_MemRd,
-#if 0
- SSE2a1_MemWr,
-#endif
+
/* 4 bytes, writes an integer register. Insns of the form
bbbbbbbb:bbbbbbbb:bbbbbbbb:11 ireg bbb.
Held in val1[15:0] and val2[15:0], and ireg is to be replaced
@@ -718,7 +716,7 @@
/* 5 bytes, no memrefs, no iregdefs, copy exactly to the
output. Held in val1[15:0], val2[15:0] and val3[7:0]. */
SSE5,
-#if 0
+
/* 5 bytes, reads/writes mem. Insns of the form
bbbbbbbb:bbbbbbbb:bbbbbbbb:mod mmxreg r/m:bbbbbbbb
Held in val1[15:0], val2[15:0], lit32[7:0].
@@ -726,8 +724,7 @@
to the Temp/RealReg holding the address. Arg3 holds this
Temp/Real Reg. Transfer is always at size 16. */
SSE3a1_MemRd,
- SSE3a1_MemWr,
-#endif
+
/* ------------------------ */
/* Not strictly needed, but improve address calculation translations. */
diff --git a/memcheck/mc_translate.c b/memcheck/mc_translate.c
index 4d9e0f4..e88f442 100644
--- a/memcheck/mc_translate.c
+++ b/memcheck/mc_translate.c
@@ -1107,7 +1107,8 @@
case SSE3a_MemRd:
case SSE3a_MemWr:
case SSE2a_MemWr:
- case SSE2a_MemRd: {
+ case SSE2a_MemRd:
+ case SSE3a1_MemRd: {
Bool is_load;
Int t_size;
@@ -1116,7 +1117,8 @@
t_size = INVALID_TEMPREG;
is_load = u_in->opcode==SSE2a_MemRd
- || u_in->opcode==SSE3a_MemRd;
+ || u_in->opcode==SSE3a_MemRd
+ || u_in->opcode==SSE3a1_MemRd;
sk_assert(u_in->tag3 == TempReg);