; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
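;; The blendvb_fallback tests below check that selects with variable <N x i1>
;; masks are lowered to vblendvps on this AVX target.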
define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @blendvb_fallback_v4i32
; CHECK: vblendvps
; CHECK: ret
  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
  ret <4 x i32> %ret
}

define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: @blendvb_fallback_v8i32
; CHECK: vblendvps
; CHECK: ret
  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
  ret <8 x i32> %ret
}

define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @blendvb_fallback_v8f32
; CHECK: vblendvps
; CHECK: ret
  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
  ret <8 x float> %ret
}

declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
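;; For reference in the checks below: the insertps immediate encodes the
;; source element in bits [7:6], the destination element in bits [5:4], and a
;; zero mask in bits [3:0]. So $48 (0x30) inserts source element 0 into
;; destination element 3, $96 (0x60) element 1 into element 2, and $192 (0xC0)
;; element 3 into element 0.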
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; CHECK-LABEL: insertps_from_vector_load:
; On X32, account for the argument's move to registers
; X32: movl 4(%esp), %eax
; CHECK-NOT: mov
; CHECK: insertps $48
; CHECK-NEXT: ret
  %1 = load <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  ret <4 x float> %2
}

;; Use a non-zero CountS for insertps
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; CHECK-LABEL: insertps_from_vector_load_offset:
; On X32, account for the argument's move to registers
; X32: movl 4(%esp), %eax
; CHECK-NOT: mov
;; Try to match a bit more of the instr, since we need the load's offset.
; CHECK: insertps $96, 4(%{{...}}), %
; CHECK-NEXT: ret
  %1 = load <4 x float>* %pb, align 16
  %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  ret <4 x float> %2
}

define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; CHECK-LABEL: insertps_from_vector_load_offset_2:
; On X32, account for the argument's move to registers
; X32: movl 4(%esp), %eax
; X32: movl 8(%esp), %ecx
; CHECK-NOT: mov
;; Try to match a bit more of the instr, since we need the load's offset.
; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
; CHECK-NEXT: ret
  %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
  %2 = load <4 x float>* %1, align 16
  %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  ret <4 x float> %3
}

define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; CHECK-LABEL: insertps_from_broadcast_loadf32:
; On X32, account for the arguments' move to registers
; X32: movl 8(%esp), %eax
; X32: movl 4(%esp), %ecx
; CHECK-NOT: mov
; CHECK: insertps $48
; CHECK-NEXT: ret
  %1 = getelementptr inbounds float* %fb, i64 %index
  %2 = load float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
; On X32, account for the arguments' move to registers
; X32: movl 4(%esp), %{{...}}
; CHECK-NOT: mov
; CHECK: insertps $48
; CHECK-NEXT: ret
  %1 = load <4 x float>* %b, align 4
  %2 = extractelement <4 x float> %1, i32 0
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  ret <4 x float> %7
}

;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; CHECK-LABEL: insertps_from_broadcast_multiple_use:
; On X32, account for the arguments' move to registers
; X32: movl 8(%esp), %eax
; X32: movl 4(%esp), %ecx
; CHECK: vbroadcastss
; CHECK-NOT: mov
; CHECK: insertps $48
; CHECK: insertps $48
; CHECK: insertps $48
; CHECK: insertps $48
; CHECK: vaddps
; CHECK: vaddps
; CHECK: vaddps
; CHECK-NEXT: ret
  %1 = getelementptr inbounds float* %fb, i64 %index
  %2 = load float* %1, align 4
  %3 = insertelement <4 x float> undef, float %2, i32 0
  %4 = insertelement <4 x float> %3, float %2, i32 1
  %5 = insertelement <4 x float> %4, float %2, i32 2
  %6 = insertelement <4 x float> %5, float %2, i32 3
  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
  %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
  %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
  %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
  %11 = fadd <4 x float> %7, %8
  %12 = fadd <4 x float> %9, %10
  %13 = fadd <4 x float> %11, %12
  ret <4 x float> %13
}