[PowerPC] Emit VMX loads/stores for aligned ops to avoid adding swaps on LE
Fixes PR30730.
This is a re-commit of a pulled commit. The commit was pulled because some
software projects contained uses of Altivec vectors that violated alignment
requirements. Known issues have now been fixed.
Committing on behalf of Lei Huang.
Differential Revision: https://reviews.llvm.org/D26861
llvm-svn: 301892
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
index fa4d212..1bce9d4 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -875,8 +875,8 @@
; P9LE: blr
; P8BE: lxvw4x
; P8BE: blr
-; P8LE: lxvd2x
-; P8LE: xxswapd
+; P8LE: lvx
+; P8LE-NOT: xxswapd
; P8LE: blr
}
@@ -942,8 +942,7 @@
; P8BE: vperm
; P8BE: blr
; P8LE: lxvd2x
-; P8LE-DAG: lxvd2x
-; P8LE-DAG: xxswapd
+; P8LE-DAG: lvx
; P8LE: xxswapd
; P8LE: vperm
; P8LE: blr
@@ -1036,7 +1035,6 @@
; P8LE: sldi {{r[0-9]+}}, r4, 2
; P8LE-DAG: lxvd2x
; P8LE-DAG: lxvd2x
-; P8LE-DAG: xxswapd
; P8LE: xxswapd
; P8LE: vperm
; P8LE: blr
@@ -1289,8 +1287,8 @@
; P9LE: blr
; P8BE: lxvw4x
; P8BE: blr
-; P8LE: lxvd2x
-; P8LE: xxswapd
+; P8LE: lvx
+; P8LE-NOT: xxswapd
; P8LE: blr
}
@@ -1315,7 +1313,7 @@
; P8BE: xvcvspsxws v2, [[REG1]]
; P8BE: blr
; P8LE: lxvd2x [[REG1:[vs0-9]+]], 0, r3
-; P8LE: xxswapd v2, [[REG1]]
+; P8LE: xxswapd
; P8LE: xvcvspsxws v2, v2
; P8LE: blr
}
@@ -1359,8 +1357,7 @@
; P8BE: xvcvspsxws
; P8BE: blr
; P8LE: lxvd2x
-; P8LE-DAG: lxvd2x
-; P8LE-DAG: xxswapd
+; P8LE-DAG: lvx
; P8LE: xxswapd
; P8LE: vperm
; P8LE: xvcvspsxws
@@ -1566,8 +1563,8 @@
; P9LE: blr
; P8BE: lxvw4x
; P8BE: blr
-; P8LE: lxvd2x
-; P8LE: xxswapd
+; P8LE: lvx
+; P8LE-NOT: xxswapd
; P8LE: blr
}
@@ -2036,8 +2033,8 @@
; P9LE: blr
; P8BE: lxvw4x
; P8BE: blr
-; P8LE: lxvd2x
-; P8LE: xxswapd
+; P8LE: lvx
+; P8LE-NOT: xxswapd
; P8LE: blr
}
@@ -2103,8 +2100,8 @@
; P8BE: vperm
; P8BE: blr
; P8LE: lxvd2x
-; P8LE-DAG: lxvd2x
-; P8LE-DAG: xxswapd
+; P8LE-DAG: lvx
+; P8LE-NOT: xxswapd
; P8LE: xxswapd
; P8LE: vperm
; P8LE: blr
@@ -2195,10 +2192,8 @@
; P8BE: vperm
; P8BE: blr
; P8LE-DAG: sldi {{r[0-9]+}}, r4, 2
-; P8LE-DAG: lxvd2x
-; P8LE-DAG: lxvd2x
-; P8LE-DAG: xxswapd
-; P8LE: xxswapd
+; P8LE-DAG: lvx
+; P8LE-DAG: lvx
; P8LE: vperm
; P8LE: blr
}
@@ -2450,8 +2445,8 @@
; P9LE: blr
; P8BE: lxvw4x
; P8BE: blr
-; P8LE: lxvd2x
-; P8LE: xxswapd
+; P8LE: lvx
+; P8LE-NOT: xxswapd
; P8LE: blr
}
@@ -2519,9 +2514,8 @@
; P8BE: vperm
; P8BE: xvcvspuxws
; P8BE: blr
-; P8LE: lxvd2x
; P8LE-DAG: lxvd2x
-; P8LE-DAG: xxswapd
+; P8LE-DAG: lvx
; P8LE: xxswapd
; P8LE: vperm
; P8LE: xvcvspuxws
@@ -2727,8 +2721,8 @@
; P9LE: blr
; P8BE: lxvw4x
; P8BE: blr
-; P8LE: lxvd2x
-; P8LE: xxswapd
+; P8LE: lvx
+; P8LE-NOT: xxswapd
; P8LE: blr
}
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/llvm/test/CodeGen/PowerPC/ppc64-i128-abi.ll
index 924e04a..4a8fd90 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-i128-abi.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-i128-abi.ll
@@ -1,5 +1,6 @@
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN: -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE
+; RUN: -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-LE \
+; RUN: --implicit-check-not xxswapd
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr8 < %s | FileCheck %s -check-prefix=CHECK-BE
@@ -8,13 +9,15 @@
; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX
+; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-NOVSX \
+; RUN: --implicit-check-not xxswapd
; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-BE-NOVSX
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
-; RUN: -mcpu=pwr8 -mattr=-vsx < %s | FileCheck %s -check-prefix=CHECK-LE-NOVSX
+; RUN: -mcpu=pwr8 -mattr=-vsx < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-LE-NOVSX --implicit-check-not xxswapd
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -ppc-vsr-nums-as-vr < %s | FileCheck %s \
@@ -26,7 +29,7 @@
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: -mcpu=pwr9 -mattr=-power9-vector -mattr=-direct-move < %s | \
-; RUN: FileCheck %s -check-prefix=CHECK-LE
+; RUN: FileCheck %s -check-prefix=CHECK-LE --implicit-check-not xxswapd
@x = common global <1 x i128> zeroinitializer, align 16
@y = common global <1 x i128> zeroinitializer, align 16
@@ -199,8 +202,7 @@
ret <1 x i128> %ret
; CHECK-LE-LABEL: @call_v1i128_increment_by_one
-; CHECK-LE: lxvd2x [[PARAM:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK-LE: xxswapd 34, [[PARAM]]
+; CHECK-LE: lvx 2, {{[0-9]+}}, {{[0-9]+}}
; CHECK-LE: bl v1i128_increment_by_one
; CHECK-LE: blr
@@ -229,10 +231,8 @@
ret <1 x i128> %ret
; CHECK-LE-LABEL: @call_v1i128_increment_by_val
-; CHECK-LE: lxvd2x [[PARAM1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK-LE: lxvd2x [[PARAM2:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
-; CHECK-LE-DAG: xxswapd 34, [[PARAM1]]
-; CHECK-LE-DAG: xxswapd 35, [[PARAM2]]
+; CHECK-LE: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK-LE: lvx 3, {{[0-9]+}}, {{[0-9]+}}
; CHECK-LE: bl v1i128_increment_by_val
; CHECK-LE: blr
diff --git a/llvm/test/CodeGen/PowerPC/swaps-le-1.ll b/llvm/test/CodeGen/PowerPC/swaps-le-1.ll
index f3db4f5..7626405 100644
--- a/llvm/test/CodeGen/PowerPC/swaps-le-1.ll
+++ b/llvm/test/CodeGen/PowerPC/swaps-le-1.ll
@@ -13,6 +13,12 @@
; RUN: -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s \
; RUN: | FileCheck -check-prefix=NOOPTSWAP %s
+; LH: 2016-11-17
+; Updated align attribute from 16 to 8 to keep the swap instruction tests.
+; Changes have been made on little-endian to use lvx and stvx
+; instructions instead of lxvd2x/xxswapd and xxswapd/stxvd2x for
+; aligned vectors with elements up to 4 bytes
+
; This test was generated from the following source:
;
; #define N 4096
@@ -29,10 +35,10 @@
; }
; }
-@cb = common global [4096 x i32] zeroinitializer, align 16
-@cc = common global [4096 x i32] zeroinitializer, align 16
-@cd = common global [4096 x i32] zeroinitializer, align 16
-@ca = common global [4096 x i32] zeroinitializer, align 16
+@cb = common global [4096 x i32] zeroinitializer, align 8
+@cc = common global [4096 x i32] zeroinitializer, align 8
+@cd = common global [4096 x i32] zeroinitializer, align 8
+@ca = common global [4096 x i32] zeroinitializer, align 8
define void @foo() {
entry:
@@ -42,63 +48,63 @@
%index = phi i64 [ 0, %entry ], [ %index.next.3, %vector.body ]
%0 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index
%1 = bitcast i32* %0 to <4 x i32>*
- %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
+ %wide.load = load <4 x i32>, <4 x i32>* %1, align 8
%2 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index
%3 = bitcast i32* %2 to <4 x i32>*
- %wide.load13 = load <4 x i32>, <4 x i32>* %3, align 16
+ %wide.load13 = load <4 x i32>, <4 x i32>* %3, align 8
%4 = add nsw <4 x i32> %wide.load13, %wide.load
%5 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index
%6 = bitcast i32* %5 to <4 x i32>*
- %wide.load14 = load <4 x i32>, <4 x i32>* %6, align 16
+ %wide.load14 = load <4 x i32>, <4 x i32>* %6, align 8
%7 = mul nsw <4 x i32> %4, %wide.load14
%8 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index
%9 = bitcast i32* %8 to <4 x i32>*
- store <4 x i32> %7, <4 x i32>* %9, align 16
+ store <4 x i32> %7, <4 x i32>* %9, align 8
%index.next = add nuw nsw i64 %index, 4
%10 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next
%11 = bitcast i32* %10 to <4 x i32>*
- %wide.load.1 = load <4 x i32>, <4 x i32>* %11, align 16
+ %wide.load.1 = load <4 x i32>, <4 x i32>* %11, align 8
%12 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next
%13 = bitcast i32* %12 to <4 x i32>*
- %wide.load13.1 = load <4 x i32>, <4 x i32>* %13, align 16
+ %wide.load13.1 = load <4 x i32>, <4 x i32>* %13, align 8
%14 = add nsw <4 x i32> %wide.load13.1, %wide.load.1
%15 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next
%16 = bitcast i32* %15 to <4 x i32>*
- %wide.load14.1 = load <4 x i32>, <4 x i32>* %16, align 16
+ %wide.load14.1 = load <4 x i32>, <4 x i32>* %16, align 8
%17 = mul nsw <4 x i32> %14, %wide.load14.1
%18 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next
%19 = bitcast i32* %18 to <4 x i32>*
- store <4 x i32> %17, <4 x i32>* %19, align 16
+ store <4 x i32> %17, <4 x i32>* %19, align 8
%index.next.1 = add nuw nsw i64 %index.next, 4
%20 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.1
%21 = bitcast i32* %20 to <4 x i32>*
- %wide.load.2 = load <4 x i32>, <4 x i32>* %21, align 16
+ %wide.load.2 = load <4 x i32>, <4 x i32>* %21, align 8
%22 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.1
%23 = bitcast i32* %22 to <4 x i32>*
- %wide.load13.2 = load <4 x i32>, <4 x i32>* %23, align 16
+ %wide.load13.2 = load <4 x i32>, <4 x i32>* %23, align 8
%24 = add nsw <4 x i32> %wide.load13.2, %wide.load.2
%25 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.1
%26 = bitcast i32* %25 to <4 x i32>*
- %wide.load14.2 = load <4 x i32>, <4 x i32>* %26, align 16
+ %wide.load14.2 = load <4 x i32>, <4 x i32>* %26, align 8
%27 = mul nsw <4 x i32> %24, %wide.load14.2
%28 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.1
%29 = bitcast i32* %28 to <4 x i32>*
- store <4 x i32> %27, <4 x i32>* %29, align 16
+ store <4 x i32> %27, <4 x i32>* %29, align 8
%index.next.2 = add nuw nsw i64 %index.next.1, 4
%30 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.2
%31 = bitcast i32* %30 to <4 x i32>*
- %wide.load.3 = load <4 x i32>, <4 x i32>* %31, align 16
+ %wide.load.3 = load <4 x i32>, <4 x i32>* %31, align 8
%32 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.2
%33 = bitcast i32* %32 to <4 x i32>*
- %wide.load13.3 = load <4 x i32>, <4 x i32>* %33, align 16
+ %wide.load13.3 = load <4 x i32>, <4 x i32>* %33, align 8
%34 = add nsw <4 x i32> %wide.load13.3, %wide.load.3
%35 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.2
%36 = bitcast i32* %35 to <4 x i32>*
- %wide.load14.3 = load <4 x i32>, <4 x i32>* %36, align 16
+ %wide.load14.3 = load <4 x i32>, <4 x i32>* %36, align 8
%37 = mul nsw <4 x i32> %34, %wide.load14.3
%38 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.2
%39 = bitcast i32* %38 to <4 x i32>*
- store <4 x i32> %37, <4 x i32>* %39, align 16
+ store <4 x i32> %37, <4 x i32>* %39, align 8
%index.next.3 = add nuw nsw i64 %index.next.2, 4
%40 = icmp eq i64 %index.next.3, 4096
br i1 %40, label %for.end, label %vector.body
diff --git a/llvm/test/CodeGen/PowerPC/swaps-le-2.ll b/llvm/test/CodeGen/PowerPC/swaps-le-2.ll
index 0963b92..e7751a1 100644
--- a/llvm/test/CodeGen/PowerPC/swaps-le-2.ll
+++ b/llvm/test/CodeGen/PowerPC/swaps-le-2.ll
@@ -2,6 +2,13 @@
; Test swap removal when a vector splat must be adjusted to make it legal.
;
+
+; LH: 2016-11-17
+; Updated align attribute from 16 to 8 to keep the swap instruction tests.
+; Changes have been made on little-endian to use lvx and stvx
+; instructions instead of lxvd2x/xxswapd and xxswapd/stxvd2x for
+; aligned vectors with elements up to 4 bytes
+
; Test generated from following C code:
;
; vector char vc = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
@@ -28,37 +35,37 @@
; vir = (vector int){vi[1], vi[1], vi[1], vi[1]};
; }
-@vc = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
-@vs = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
-@vi = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
-@vcr = common global <16 x i8> zeroinitializer, align 16
-@vsr = common global <8 x i16> zeroinitializer, align 16
-@vir = common global <4 x i32> zeroinitializer, align 16
+@vc = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 8
+@vs = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 8
+@vi = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 8
+@vcr = common global <16 x i8> zeroinitializer, align 8
+@vsr = common global <8 x i16> zeroinitializer, align 8
+@vir = common global <4 x i32> zeroinitializer, align 8
; Function Attrs: nounwind
define void @cfoo() {
entry:
- %0 = load <16 x i8>, <16 x i8>* @vc, align 16
+ %0 = load <16 x i8>, <16 x i8>* @vc, align 8
%vecinit30 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
- store <16 x i8> %vecinit30, <16 x i8>* @vcr, align 16
+ store <16 x i8> %vecinit30, <16 x i8>* @vcr, align 8
ret void
}
; Function Attrs: nounwind
define void @sfoo() {
entry:
- %0 = load <8 x i16>, <8 x i16>* @vs, align 16
+ %0 = load <8 x i16>, <8 x i16>* @vs, align 8
%vecinit14 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
- store <8 x i16> %vecinit14, <8 x i16>* @vsr, align 16
+ store <8 x i16> %vecinit14, <8 x i16>* @vsr, align 8
ret void
}
; Function Attrs: nounwind
define void @ifoo() {
entry:
- %0 = load <4 x i32>, <4 x i32>* @vi, align 16
+ %0 = load <4 x i32>, <4 x i32>* @vi, align 8
%vecinit6 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
- store <4 x i32> %vecinit6, <4 x i32>* @vir, align 16
+ store <4 x i32> %vecinit6, <4 x i32>* @vir, align 8
ret void
}
diff --git a/llvm/test/CodeGen/PowerPC/vsx-ldst.ll b/llvm/test/CodeGen/PowerPC/vsx-ldst.ll
index a146182..d8dd635 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-ldst.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-ldst.ll
@@ -14,8 +14,10 @@
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mattr=+vsx -O2 \
; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
-; RUN: grep lxvd2x < %t | count 6
-; RUN: grep stxvd2x < %t | count 6
+; RUN: grep lxvd2x < %t | count 3
+; RUN: grep lvx < %t | count 3
+; RUN: grep stxvd2x < %t | count 3
+; RUN: grep stvx < %t | count 3
; RUN: llc -verify-machineinstrs -mcpu=pwr9 -O2 \
; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll
index a5dd494..cfea3e5 100644
--- a/llvm/test/CodeGen/PowerPC/vsx.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx.ll
@@ -645,8 +645,8 @@
; CHECK-FISL: blr
; CHECK-LE-LABEL: @test32
-; CHECK-LE: lxvd2x [[V1:[0-9]+]], 0, 3
-; CHECK-LE: xxswapd 34, [[V1]]
+; CHECK-LE: lvx 2, 0, 3
+; CHECK-LE-NOT: xxswapd
; CHECK-LE: blr
}
@@ -663,8 +663,8 @@
; CHECK-FISL: blr
; CHECK-LE-LABEL: @test33
-; CHECK-LE: xxswapd [[V1:[0-9]+]], 34
-; CHECK-LE: stxvd2x [[V1]], 0, 3
+; CHECK-LE-NOT: xxswapd
+; CHECK-LE: stvx 2, 0, 3
; CHECK-LE: blr
}
@@ -716,8 +716,8 @@
; CHECK-FISL: blr
; CHECK-LE-LABEL: @test34
-; CHECK-LE: lxvd2x [[V1:[0-9]+]], 0, 3
-; CHECK-LE: xxswapd 34, [[V1]]
+; CHECK-LE: lvx 2, 0, 3
+; CHECK-LE-NOT: xxswapd
; CHECK-LE: blr
}
@@ -734,8 +734,8 @@
; CHECK-FISL: blr
; CHECK-LE-LABEL: @test35
-; CHECK-LE: xxswapd [[V1:[0-9]+]], 34
-; CHECK-LE: stxvd2x [[V1]], 0, 3
+; CHECK-LE-NOT: xxswapd
+; CHECK-LE: stvx 2, 0, 3
; CHECK-LE: blr
}
@@ -1150,9 +1150,9 @@
; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3
; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]]
; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI
-; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]]
+; CHECK-LE-DAG: lvx 3, 0, [[R2]]
; CHECK-LE-DAG: xxspltw 34, [[V1]]
-; CHECK-LE-DAG: xxswapd 35, [[V2]]
+; CHECK-LE-NOT: xxswapd 35
; CHECK-LE: vadduwm 2, 2, 3
; CHECK-LE: blr
}