Build system and hwcaps fixes pertaining to #305728, which added
support for AVX2, BMI1, BMI2 and FMA instructions.
(Jakub Jelinek, jakub@redhat.com)



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13340 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/Makefile.vex.am b/Makefile.vex.am
index ddb5374..419a580 100644
--- a/Makefile.vex.am
+++ b/Makefile.vex.am
@@ -46,6 +46,8 @@
 	priv/host_generic_regs.h \
 	priv/host_generic_simd64.h \
 	priv/host_generic_simd128.h \
+	priv/host_generic_simd256.h \
+	priv/host_generic_maddf.h \
 	priv/host_x86_defs.h \
 	priv/host_amd64_defs.h \
 	priv/host_ppc_defs.h \
@@ -117,6 +119,8 @@
 	priv/host_generic_regs.c \
 	priv/host_generic_simd64.c \
 	priv/host_generic_simd128.c \
+	priv/host_generic_simd256.c \
+	priv/host_generic_maddf.c \
 	priv/host_generic_reg_alloc2.c \
 	priv/host_x86_defs.c \
 	priv/host_x86_isel.c \
diff --git a/configure.in b/configure.in
index 9aadfa2..747ccd4 100644
--- a/configure.in
+++ b/configure.in
@@ -1909,6 +1909,77 @@
 AM_CONDITIONAL(BUILD_AVX_TESTS, test x$ac_have_as_avx = xyes)
 
 
+# Does the x86/amd64 assembler understand AVX2 instructions?
+# Compile-only probe: it defines no C-level symbol, only the automake
+# conditional BUILD_AVX2_TESTS, which is used in the test Makefile.am's.
+AC_MSG_CHECKING([if x86/amd64 assembler speaks AVX2])
+
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[
+  do { long long int x; 
+   __asm__ __volatile__(
+      "vpsravd (%%rsp), %%ymm8, %%ymm7" : : : "xmm7", "xmm8" );
+   __asm__ __volatile__(
+      "vpaddb %%ymm6,%%ymm7,%%ymm8" : : : "xmm6","xmm7","xmm8"); }
+  while (0)
+]])], [
+ac_have_as_avx2=yes
+AC_MSG_RESULT([yes])
+], [
+ac_have_as_avx2=no
+AC_MSG_RESULT([no])
+])
+
+AM_CONDITIONAL(BUILD_AVX2_TESTS, test x$ac_have_as_avx2 = xyes)
+
+
+# Does the x86/amd64 assembler understand BMI1 (andn) and BMI2 (mulx)?
+# Compile-only probe: it defines no C-level symbol, only the automake
+# conditional BUILD_BMI_TESTS, which is used in the test Makefile.am's.
+AC_MSG_CHECKING([if x86/amd64 assembler speaks BMI1 and BMI2])
+
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[
+  do { unsigned int h, l;
+   __asm__ __volatile__(
+      "andn %2, %1, %0" : "=r" (h) : "r" (0x1234567), "r" (0x7654321) );
+   __asm__ __volatile__(
+      "movl %2, %%edx; mulx %3, %1, %0" : "=r" (h), "=r" (l) : "g" (0x1234567), "g" (0x7654321) : "edx" ); }
+  while (0)
+]])], [
+ac_have_as_bmi=yes
+AC_MSG_RESULT([yes])
+], [
+ac_have_as_bmi=no
+AC_MSG_RESULT([no])
+])
+
+AM_CONDITIONAL(BUILD_BMI_TESTS, test x$ac_have_as_bmi = xyes)
+
+
+# Does the x86/amd64 assembler understand FMA instructions?
+# Compile-only probe: it defines no C-level symbol, only the automake
+# conditional BUILD_FMA_TESTS, which is used in the test Makefile.am's.
+AC_MSG_CHECKING([if x86/amd64 assembler speaks FMA])
+
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[]], [[
+  do { unsigned int h, l;
+   __asm__ __volatile__(
+      "vfmadd132ps (%%rsp), %%ymm8, %%ymm7" : : : "xmm7", "xmm8" );
+   __asm__ __volatile__(
+      "vfnmsub231sd (%%rsp), %%xmm8, %%xmm7" : : : "xmm7", "xmm8" );
+   __asm__ __volatile__(
+      "vfmsubadd213pd (%%rsp), %%xmm8, %%xmm7" : : : "xmm7", "xmm8" ); }
+  while (0)
+]])], [
+ac_have_as_fma=yes
+AC_MSG_RESULT([yes])
+], [
+ac_have_as_fma=no
+AC_MSG_RESULT([no])
+])
+
+AM_CONDITIONAL(BUILD_FMA_TESTS, test x$ac_have_as_fma = xyes)
+
+
 # does the x86/amd64 assembler understand MOVBE?
 # Note, this doesn't generate a C-level symbol.  It generates a
 # automake-level symbol (BUILD_MOVBE_TESTS), used in test Makefile.am's
diff --git a/coregrind/m_machine.c b/coregrind/m_machine.c
index dce7ee6..7ba19c9 100644
--- a/coregrind/m_machine.c
+++ b/coregrind/m_machine.c
@@ -767,9 +767,9 @@
 
 #elif defined(VGA_amd64)
    { Bool have_sse3, have_cx8, have_cx16;
-     Bool have_lzcnt, have_avx /*, have_fma*/;
+     Bool have_lzcnt, have_avx, have_bmi, have_avx2;
      Bool have_rdtscp;
-     UInt eax, ebx, ecx, edx, max_extended;
+     UInt eax, ebx, ecx, edx, max_basic, max_extended;
      HChar vstr[13];
      vstr[0] = 0;
 
@@ -778,7 +778,8 @@
         return False;
 
      VG_(cpuid)(0, 0, &eax, &ebx, &ecx, &edx);
-     if (eax < 1)
+     max_basic = eax;
+     if (max_basic < 1)
         /* we can't ask for cpuid(x) for x > 0.  Give up. */
         return False;
 
@@ -835,13 +836,13 @@
      /* on amd64 we tolerate older cpus, which don't have cmpxchg16b */
      have_cx16 = (ecx & (1<<13)) != 0; /* True => have cmpxchg16b */
 
-     /* Figure out if this is an AMD that can do LZCNT. */
+     /* Figure out if this CPU can do LZCNT. */
      have_lzcnt = False;
-     if (0 == VG_(strcmp)(vstr, "AuthenticAMD")
-         && max_extended >= 0x80000001) {
+     if (max_extended >= 0x80000001) {
         VG_(cpuid)(0x80000001, 0, &eax, &ebx, &ecx, &edx);
         have_lzcnt = (ecx & (1<<5)) != 0; /* True => have LZCNT */
      }
+
      /* Can we do RDTSCP? */
      have_rdtscp = False;
      if (max_extended >= 0x80000001) {
@@ -849,11 +850,22 @@
         have_rdtscp = (edx & (1<<27)) != 0; /* True => have RDTSVCP */
      }
 
+     /* Check for BMI1 and AVX2. */
+     have_bmi = False;
+     have_avx2 = False;
+     if (max_basic >= 7) {
+        VG_(cpuid)(7, 0, &eax, &ebx, &ecx, &edx);
+        have_bmi = (ebx & (1<<3)) != 0; /* True => have BMI1 */
+        have_avx2 = have_avx && ((ebx & (1<<5)) != 0); /* True => have AVX2 */
+     }
+
      va         = VexArchAMD64;
      vai.hwcaps = (have_sse3   ? VEX_HWCAPS_AMD64_SSE3   : 0)
                 | (have_cx16   ? VEX_HWCAPS_AMD64_CX16   : 0)
                 | (have_lzcnt  ? VEX_HWCAPS_AMD64_LZCNT  : 0)
                 | (have_avx    ? VEX_HWCAPS_AMD64_AVX    : 0)
+                | (have_bmi    ? VEX_HWCAPS_AMD64_BMI    : 0)
+                | (have_avx2   ? VEX_HWCAPS_AMD64_AVX2   : 0)
                 | (have_rdtscp ? VEX_HWCAPS_AMD64_RDTSCP : 0);
 
      VG_(machine_get_cache_info)(&vai);