Add various %rflags-helper specialisation cases and fast paths.  This
roughly doubles the performance of the baseline simulation on integer
code.
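
Note: the new SUBL/LOGICL cases below compare 32-bit operands by shifting
them into the top half of a 64-bit word (binop(Iop_Shl64, ..., mkU8(32)))
and then using 64-bit compares.  A minimal standalone check of that
identity -- a sketch against plain C types, not part of the patch itself
(the typedefs and the shl32 helper here are illustrative only) -- is:

   #include <assert.h>
   #include <stdio.h>

   /* Illustrative stand-ins for the VEX basic types. */
   typedef unsigned int       UInt;
   typedef unsigned long long ULong;
   typedef long long          Long;
   typedef int                Int;

   /* Mirrors binop(Iop_Shl64, cc_depN, mkU8(32)) from the patch. */
   static ULong shl32 ( UInt w ) { return ((ULong)w) << 32; }

   int main ( void )
   {
      UInt tests[] = { 0, 1, 2, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFEu, 0xFFFFFFFFu };
      Int  i, j, n = (Int)(sizeof(tests)/sizeof(tests[0]));
      for (i = 0; i < n; i++) {
         for (j = 0; j < n; j++) {
            UInt x = tests[i], y = tests[j];
            /* SUBL then CondL:  CmpLT64S on shifted operands == 32-bit signed <   */
            assert( (((Long)shl32(x)) <  ((Long)shl32(y))) == (((Int)x) <  ((Int)y)) );
            /* SUBL then CondLE: CmpLE64S on shifted operands == 32-bit signed <=  */
            assert( (((Long)shl32(x)) <= ((Long)shl32(y))) == (((Int)x) <= ((Int)y)) );
            /* rflags_c after SUBL: CmpLT64U on shifted operands == 32-bit unsigned < */
            assert( (shl32(x) <  shl32(y)) == (x <  y) );
            /* LOGICL then CondZ:  CmpEQ64(shifted, 0)  == 32-bit result is zero   */
            assert( (shl32(x) == 0) == (x == 0) );
            /* LOGICL then CondLE: CmpLE64S(shifted, 0) == 32-bit result <=s 0     */
            assert( (((Long)shl32(x)) <= 0) == (((Int)x) <= 0) );
         }
      }
      printf("shift-by-32 compare identities hold for all test pairs\n");
      return 0;
   }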



git-svn-id: svn://svn.valgrind.org/vex/trunk@1143 8f6e269a-dfd6-0310-a8e1-e2731360e62c
diff --git a/priv/guest-amd64/ghelpers.c b/priv/guest-amd64/ghelpers.c
index e306565..3a8de8a 100644
--- a/priv/guest-amd64/ghelpers.c
+++ b/priv/guest-amd64/ghelpers.c
@@ -525,12 +525,14 @@
    for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
 
       ch = ' ';
-      if (op > 0 && (op-1) % 3 == 0) 
+      if (op > 0 && (op-1) % 4 == 0) 
          ch = 'B';
-      if (op > 0 && (op-1) % 3 == 1) 
+      if (op > 0 && (op-1) % 4 == 1) 
          ch = 'W';
-      if (op > 0 && (op-1) % 3 == 2) 
+      if (op > 0 && (op-1) % 4 == 2) 
          ch = 'L';
+      if (op > 0 && (op-1) % 4 == 3) 
+         ch = 'Q';
 
       vex_printf("%2d%c: ", op, ch);
       vex_printf("%6u ", tabc_slow[op]);
@@ -694,24 +696,23 @@
 
    /* Fast-case some common ones. */
    switch (cc_op) {
-#  if 0 // REINSTATE CAREFULLY
+      case AMD64G_CC_OP_LOGICQ: 
       case AMD64G_CC_OP_LOGICL: 
       case AMD64G_CC_OP_LOGICW: 
       case AMD64G_CC_OP_LOGICB:
          return 0;
-      case AMD64G_CC_OP_SUBL:
-         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
-                   ? AMD64G_CC_MASK_C : 0;
-      case AMD64G_CC_OP_SUBW:
-         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
-                   ? AMD64G_CC_MASK_C : 0;
-      case AMD64G_CC_OP_SUBB:
-         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
-                   ? AMD64G_CC_MASK_C : 0;
-      case AMD64G_CC_OP_INCL:
-      case AMD64G_CC_OP_DECL:
-         return cc_ndep & AMD64G_CC_MASK_C;
-#  endif // REINSTATE CAREFULLY
+      //      case AMD64G_CC_OP_SUBL:
+      //         return ((UInt)cc_dep1) < ((UInt)cc_dep2)
+      //                   ? AMD64G_CC_MASK_C : 0;
+      //      case AMD64G_CC_OP_SUBW:
+      //         return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
+      //                   ? AMD64G_CC_MASK_C : 0;
+      //      case AMD64G_CC_OP_SUBB:
+      //         return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
+      //                   ? AMD64G_CC_MASK_C : 0;
+      //      case AMD64G_CC_OP_INCL:
+      //      case AMD64G_CC_OP_DECL:
+      //         return cc_ndep & AMD64G_CC_MASK_C;
       default: 
          break;
    }
@@ -830,48 +831,48 @@
 /*--- %rflags functions.                                      ---*/
 /*---------------------------------------------------------------*/
 
-//.. /* Used by the optimiser to try specialisations.  Returns an
-//..    equivalent expression, or NULL if none. */
-//.. 
-//.. static Bool isU32 ( IRExpr* e, UInt n )
-//.. {
-//..    return e->tag == Iex_Const
-//..           && e->Iex.Const.con->tag == Ico_U32
-//..           && e->Iex.Const.con->Ico.U32 == n;
-//.. }
+/* Used by the optimiser to try specialisations.  Returns an
+   equivalent expression, or NULL if none. */
+
+static Bool isU64 ( IRExpr* e, ULong n )
+{
+   return e->tag == Iex_Const
+          && e->Iex.Const.con->tag == Ico_U64
+          && e->Iex.Const.con->Ico.U64 == n;
+}
 
 IRExpr* guest_amd64_spechelper ( HChar* function_name,
                                  IRExpr** args )
 {
-//.. #  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
-//.. #  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
-//.. #  define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
-//.. #  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))
-//.. 
-//..    Int i, arity = 0;
-//..    for (i = 0; args[i]; i++)
-//..       arity++;
-//.. #  if 0
-//..    vex_printf("spec request:\n");
-//..    vex_printf("   %s  ", function_name);
-//..    for (i = 0; i < arity; i++) {
-//..       vex_printf("  ");
-//..       ppIRExpr(args[i]);
-//..    }
-//..    vex_printf("\n");
-//.. #  endif
-//.. 
-//..    /* --------- specialising "x86g_calculate_condition" --------- */
-//.. 
-//..    if (vex_streq(function_name, "x86g_calculate_condition")) {
-//..       /* specialise calls to above "calculate condition" function */
-//..       IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
-//..       vassert(arity == 5);
-//..       cond    = args[0];
-//..       cc_op   = args[1];
-//..       cc_dep1 = args[2];
-//..       cc_dep2 = args[3];
-//.. 
+#  define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
+#  define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
+#  define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
+#  define mkU8(_n)  IRExpr_Const(IRConst_U8(_n))
+
+   Int i, arity = 0;
+   for (i = 0; args[i]; i++)
+      arity++;
+#  if 0
+   vex_printf("spec request:\n");
+   vex_printf("   %s  ", function_name);
+   for (i = 0; i < arity; i++) {
+      vex_printf("  ");
+      ppIRExpr(args[i]);
+   }
+   vex_printf("\n");
+#  endif
+
+   /* --------- specialising "amd64g_calculate_condition" --------- */
+
+   if (vex_streq(function_name, "amd64g_calculate_condition")) {
+      /* specialise calls to above "calculate condition" function */
+      IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
+      vassert(arity == 5);
+      cond    = args[0];
+      cc_op   = args[1];
+      cc_dep1 = args[2];
+      cc_dep2 = args[3];
+
 //..       /*---------------- ADDL ----------------*/
 //.. 
 //..       if (isU32(cc_op, AMD64G_CC_OP_ADDL) && isU32(cond, X86CondZ)) {
@@ -881,9 +882,9 @@
 //..                            binop(Iop_Add32, cc_dep1, cc_dep2),
 //..                            mkU32(0)));
 //..       }
-//.. 
-//..       /*---------------- SUBL ----------------*/
-//.. 
+
+      /*---------------- SUBL ----------------*/
+
 //..       if (isU32(cc_op, AMD64G_CC_OP_SUBL) && isU32(cond, X86CondZ)) {
 //..          /* long sub/cmp, then Z --> test dst==src */
 //..          return unop(Iop_1Uto32,
@@ -895,21 +896,30 @@
 //..          return unop(Iop_1Uto32,
 //..                      binop(Iop_CmpNE32, cc_dep1, cc_dep2));
 //..       }
-//.. 
-//..       if (isU32(cc_op, AMD64G_CC_OP_SUBL) && isU32(cond, X86CondL)) {
-//..          /* long sub/cmp, then L (signed less than) 
-//..             --> test dst <s src */
-//..          return unop(Iop_1Uto32,
-//..                      binop(Iop_CmpLT32S, cc_dep1, cc_dep2));
-//..       }
-//.. 
-//..       if (isU32(cc_op, AMD64G_CC_OP_SUBL) && isU32(cond, X86CondLE)) {
-//..          /* long sub/cmp, then LE (signed less than or equal)
-//..             --> test dst <=s src */
-//..          return unop(Iop_1Uto32,
-//..                      binop(Iop_CmpLE32S, cc_dep1, cc_dep2));
-//..       }
-//.. 
+
+      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
+         /* long sub/cmp, then L (signed less than) 
+            --> test dst <s src */
+         return unop(Iop_32Uto64,
+                unop(Iop_1Uto32,
+                     binop(Iop_CmpLT64S, 
+                           binop(Iop_Shl64,cc_dep1,mkU8(32)),
+                           binop(Iop_Shl64,cc_dep2,mkU8(32)))));
+
+      }
+
+      if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
+         /* long sub/cmp, then LE (signed less than or equal) 
+            --> test dst <=s src */
+         return unop(Iop_32Uto64,
+                unop(Iop_1Uto32,
+                     binop(Iop_CmpLE64S, 
+                           binop(Iop_Shl64,cc_dep1,mkU8(32)),
+                           binop(Iop_Shl64,cc_dep2,mkU8(32)))));
+
+      }
+
+
 //..       if (isU32(cc_op, AMD64G_CC_OP_SUBL) && isU32(cond, X86CondBE)) {
 //..          /* long sub/cmp, then BE (unsigned less than or equal)
 //..             --> test dst <=u src */
@@ -923,35 +933,38 @@
 //..          return unop(Iop_1Uto32,
 //..                      binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
 //..       }
-//.. 
-//..       /*---------------- SUBW ----------------*/
-//.. 
-//..       if (isU32(cc_op, AMD64G_CC_OP_SUBW) && isU32(cond, X86CondZ)) {
-//..          /* byte sub/cmp, then Z --> test dst==src */
-//..          return unop(Iop_1Uto32,
-//..                      binop(Iop_CmpEQ16, 
-//..                            unop(Iop_32to16,cc_dep1), 
-//..                            unop(Iop_32to16,cc_dep2)));
-//..       }
-//.. 
-//..       /*---------------- SUBB ----------------*/
-//.. 
-//..       if (isU32(cc_op, AMD64G_CC_OP_SUBB) && isU32(cond, X86CondZ)) {
-//..          /* byte sub/cmp, then Z --> test dst==src */
-//..          return unop(Iop_1Uto32,
-//..                      binop(Iop_CmpEQ8, 
-//..                            unop(Iop_32to8,cc_dep1), 
-//..                            unop(Iop_32to8,cc_dep2)));
-//..       }
-//.. 
-//..       if (isU32(cc_op, AMD64G_CC_OP_SUBB) && isU32(cond, X86CondNZ)) {
-//..          /* byte sub/cmp, then NZ --> test dst!=src */
-//..          return unop(Iop_1Uto32,
-//..                      binop(Iop_CmpNE8, 
-//..                            unop(Iop_32to8,cc_dep1), 
-//..                            unop(Iop_32to8,cc_dep2)));
-//..       }
-//.. 
+
+      /*---------------- SUBW ----------------*/
+
+      if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
+         /* word sub/cmp, then Z --> test dst==src */
+         return unop(Iop_32Uto64,
+                unop(Iop_1Uto32,
+                     binop(Iop_CmpEQ16, 
+                           unop(Iop_32to16,unop(Iop_64to32,cc_dep1)),
+                           unop(Iop_32to16,unop(Iop_64to32,cc_dep2)))));
+      }
+
+      /*---------------- SUBB ----------------*/
+
+      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
+         /* byte sub/cmp, then Z --> test dst==src */
+         return unop(Iop_32Uto64,
+                unop(Iop_1Uto32,
+                     binop(Iop_CmpEQ8, 
+                           unop(Iop_32to8,unop(Iop_64to32,cc_dep1)),
+                           unop(Iop_32to8,unop(Iop_64to32,cc_dep2)))));
+      }
+
+//      if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
+//         /* byte sub/cmp, then NZ --> test dst!=src */
+//         return unop(Iop_32Uto64,
+//                unop(Iop_1Uto32,
+//                     binop(Iop_CmpNE8, 
+//                           unop(Iop_32to8,unop(Iop_64to32,cc_dep1)),
+//                           unop(Iop_32to8,unop(Iop_64to32,cc_dep2)))));
+//      }
+
 //..       if (isU32(cc_op, AMD64G_CC_OP_SUBB) && isU32(cond, X86CondNBE)) {
 //..          /* long sub/cmp, then NBE (unsigned greater than)
 //..             --> test src <=u dst */
@@ -961,29 +974,35 @@
 //..                            binop(Iop_And32,cc_dep2,mkU32(0xFF)),
 //.. 			   binop(Iop_And32,cc_dep1,mkU32(0xFF))));
 //..       }
-//.. 
-//..       /*---------------- LOGICL ----------------*/
-//.. 
-//..       if (isU32(cc_op, AMD64G_CC_OP_LOGICL) && isU32(cond, X86CondZ)) {
-//..          /* long and/or/xor, then Z --> test dst==0 */
-//..          return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
-//..       }
-//.. 
+
+      /*---------------- LOGICL ----------------*/
+
+      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
+         /* long and/or/xor, then Z --> test dst==0 */
+         return unop(Iop_32Uto64,
+                unop(Iop_1Uto32,binop(Iop_CmpEQ64, 
+                                      binop(Iop_Shl64,cc_dep1,mkU8(32)), 
+                                      mkU64(0))));
+      }
+
 //..       if (isU32(cc_op, AMD64G_CC_OP_LOGICL) && isU32(cond, X86CondS)) {
 //..          /* long and/or/xor, then S --> test dst <s 0 */
 //..          return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
 //..       }
-//.. 
-//..       if (isU32(cc_op, AMD64G_CC_OP_LOGICL) && isU32(cond, X86CondLE)) {
-//..          /* long and/or/xor, then LE
-//..             This is pretty subtle.  LOGIC sets SF and ZF according to the
-//..             result and makes OF be zero.  LE computes (SZ ^ OF) | ZF, but
-//..             OF is zero, so this reduces to SZ | ZF -- which will be 1 iff
-//..             the result is <=signed 0.  Hence ...
-//..          */
-//..          return unop(Iop_1Uto32,binop(Iop_CmpLE32S, cc_dep1, mkU32(0)));
-//..       }
-//.. 
+
+      if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
+         /* long and/or/xor, then LE
+            This is pretty subtle.  LOGIC sets SF and ZF according to the
+            result and makes OF be zero.  LE computes (SZ ^ OF) | ZF, but
+            OF is zero, so this reduces to SZ | ZF -- which will be 1 iff
+            the result is <=signed 0.  Hence ...
+         */
+         return unop(Iop_32Uto64,
+                unop(Iop_1Uto32,binop(Iop_CmpLE64S, 
+                                      binop(Iop_Shl64,cc_dep1,mkU8(32)), 
+                                      mkU64(0))));
+      }
+
 //..       if (isU32(cc_op, AMD64G_CC_OP_LOGICL) && isU32(cond, X86CondBE)) {
 //..          /* long and/or/xor, then BE
 //..             LOGIC sets ZF according to the result and makes CF be zero.
@@ -1076,43 +1095,49 @@
 //..                )
 //..             );
 //..       }
-//.. 
-//..       return NULL;
-//..    }
-//.. 
-//..    /* --------- specialising "x86g_calculate_eflags_c" --------- */
-//.. 
-//..    if (vex_streq(function_name, "x86g_calculate_eflags_c")) {
-//..       /* specialise calls to above "calculate_eflags_c" function */
-//..       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
-//..       vassert(arity == 4);
-//..       cc_op   = args[0];
-//..       cc_dep1 = args[1];
-//..       cc_dep2 = args[2];
-//..       cc_ndep = args[3];
-//.. 
-//..       if (isU32(cc_op, AMD64G_CC_OP_SUBL)) {
-//..          /* C after sub denotes unsigned less than */
-//..          return unop(Iop_1Uto32,
-//..                      binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
-//..       }
-//..       if (isU32(cc_op, AMD64G_CC_OP_SUBB)) {
-//..          /* C after sub denotes unsigned less than */
-//..          return unop(Iop_1Uto32,
-//..                      binop(Iop_CmpLT32U, 
-//..                            binop(Iop_And32,cc_dep1,mkU32(0xFF)),
-//..                            binop(Iop_And32,cc_dep2,mkU32(0xFF))));
-//..       }
-//..       if (isU32(cc_op, AMD64G_CC_OP_LOGICL)
-//..           || isU32(cc_op, AMD64G_CC_OP_LOGICW)
-//..           || isU32(cc_op, AMD64G_CC_OP_LOGICB)) {
-//..          /* cflag after logic is zero */
-//..          return mkU32(0);
-//..       }
-//..       if (isU32(cc_op, AMD64G_CC_OP_DECL) || isU32(cc_op, AMD64G_CC_OP_INCL)) {
-//..          /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
-//..          return cc_ndep;
-//..       }
+
+      return NULL;
+   }
+
+   /* --------- specialising "amd64g_calculate_rflags_c" --------- */
+
+   if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
+      /* specialise calls to above "calculate_rflags_c" function */
+      IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+      vassert(arity == 4);
+      cc_op   = args[0];
+      cc_dep1 = args[1];
+      cc_dep2 = args[2];
+      cc_ndep = args[3];
+
+      if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
+         /* C after sub denotes unsigned less than */
+         return unop(Iop_32Uto64,
+                unop(Iop_1Uto32,
+                     binop(Iop_CmpLT64U, 
+                           binop(Iop_Shl64,cc_dep1,mkU8(32)), 
+                           binop(Iop_Shl64,cc_dep2,mkU8(32)))));
+      }
+      if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
+         /* C after sub denotes unsigned less than */
+         return unop(Iop_32Uto64,
+                unop(Iop_1Uto32,
+                     binop(Iop_CmpLT64U, 
+                           binop(Iop_And64,cc_dep1,mkU64(0xFF)),
+                           binop(Iop_And64,cc_dep2,mkU64(0xFF)))));
+      }
+      if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
+          || isU64(cc_op, AMD64G_CC_OP_LOGICL)
+          || isU64(cc_op, AMD64G_CC_OP_LOGICW)
+          || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
+         /* cflag after logic is zero */
+         return mkU64(0);
+      }
+      if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
+          || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
+         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
+         return cc_ndep;
+      }
 //..       if (isU32(cc_op, AMD64G_CC_OP_COPY)) {
 //..          /* cflag after COPY is stored in DEP1. */
 //..          return
@@ -1127,14 +1152,14 @@
 //..          vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
 //..       }
 //.. #     endif
+
+      return NULL;
+   }
+
+//..    /* --------- specialising "x86g_calculate_rflags_all" --------- */
 //.. 
-//..       return NULL;
-//..    }
-//.. 
-//..    /* --------- specialising "x86g_calculate_eflags_all" --------- */
-//.. 
-//..    if (vex_streq(function_name, "x86g_calculate_eflags_all")) {
-//..       /* specialise calls to above "calculate_eflags_all" function */
+//..    if (vex_streq(function_name, "x86g_calculate_rflags_all")) {
+//..       /* specialise calls to above "calculate_rflags_all" function */
 //..       IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
 //..       vassert(arity == 4);
 //..       cc_op   = args[0];
@@ -1153,11 +1178,11 @@
 //..             );
 //..       }
 //..       return NULL;
-//..    }
+//..   }
 
 #  undef unop
 #  undef binop
-#  undef mkU32
+#  undef mkU64
 #  undef mkU8
 
    return NULL;