[OPENMP]Simplify codegen for the outlined regions.

Simplified codegen for the outlined regions, excluding duplication code
for handling variables with the reference types.

llvm-svn: 361529
diff --git a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
index d7fdd1c..245d8a5 100644
--- a/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_firstprivate_codegen.cpp
@@ -86,6 +86,7 @@
       // LAMBDA: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
       // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca float*,
       // LAMBDA: [[G1_REF:%.+]] = alloca double*,
+      // LAMBDA: [[G1_REF1:%.+]] = alloca double*,
 
       // Actual private variables to be used in the body (tmp is used for the reference type)
       // LAMBDA: [[G_PRIVATE:%.+]] = alloca double,
@@ -105,9 +106,11 @@
       // LAMBDA-DAG: [[SFVAR_ADDR_VAL:%.+]] = load float*, float** [[SFVAR_ADDR]],
       // LAMBDA-DAG: [[G1_ADDR_VAL:%.+]] = load double*, double** [[G1_ADDR]],
       // LAMBDA-DAG: store double* [[G1_ADDR_VAL]], double** [[G1_REF]],
+      // LAMBDA-DAG: [[G1_ADDR_VAL:%.+]] = load double*, double** [[G1_REF]],
+      // LAMBDA-DAG: store double* [[G1_ADDR_VAL]], double** [[G1_REF1]],
       // LAMBDA-DAG: [[G_CONV_VAL:%.+]] = load{{.*}} double, double* [[G_ADDR_VAL]],
       // LAMBDA-DAG: store double [[G_CONV_VAL]], double* [[G_PRIVATE]],
-      // LAMBDA-DAG: [[TMP_VAL:%.+]] = load double*, double** [[G1_REF]],
+      // LAMBDA-DAG: [[TMP_VAL:%.+]] = load double*, double** [[G1_REF1]],
       // LAMBDA-DAG: [[TMP_VAL_VAL:%.+]] = load{{.*}} double, double* [[TMP_VAL]],
       // LAMBDA-DAG: store double [[TMP_VAL_VAL]], double* [[G1_PRIVATE]],
       // LAMBDA-DAG: store double* [[G1_PRIVATE]], double** [[TMP_PRIVATE]],
@@ -224,6 +227,7 @@
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
 // CHECK: [[TMP:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[TMP1:%.+]] = alloca [[S_FLOAT_TY]]*,
 
 // discard omp loop variables
 // CHECK: {{.*}} = alloca i{{[0-9]+}},
@@ -280,7 +284,9 @@
 // init var
 // CHECK-DAG: [[VAR_ADDR_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
 // CHECK-DAG: store{{.+}} [[VAR_ADDR_VAL]],{{.+}} [[TMP]],
-// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
+// CHECK-DAG: [[VAR_ADDR_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
+// CHECK-DAG: store{{.+}} [[VAR_ADDR_VAL]],{{.+}} [[TMP1]],
+// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP1]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to{{.+}}
 // CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to{{.+}}
 // CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
@@ -321,6 +327,7 @@
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP1:%.+]] = alloca [[S_INT_TY]]*,
 
 // discard omp loop variables
 // CHECK: {{.*}} = alloca i{{[0-9]+}},
@@ -375,7 +382,9 @@
 // init var
 // CHECK-DAG: [[VAR_ADDR_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR]],
 // CHECK-DAG: store{{.+}} [[VAR_ADDR_VAL]],{{.+}} [[TMP]],
-// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
+// CHECK-DAG: [[VAR_ADDR_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
+// CHECK-DAG: store{{.+}} [[VAR_ADDR_VAL]],{{.+}} [[TMP1]],
+// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP1]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to{{.+}}
 // CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to{{.+}}
 // CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
diff --git a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
index 175f3c8..8d6ceb6 100644
--- a/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_lastprivate_codegen.cpp
@@ -85,6 +85,7 @@
       // LAMBDA: [[SVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}*,
       // LAMBDA: [[SFVAR_PRIVATE_ADDR:%.+]] = alloca float*,
       // LAMBDA: [[TMP_G1:%.+]] = alloca double*,
+      // LAMBDA: [[TMP1_G1:%.+]] = alloca double*,
       // loop variables
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
@@ -104,11 +105,13 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: store double* [[G1_IN_REF]], double** [[TMP_G1]],
-      // LAMBDA: [[TMP_G1_VAL:%.+]] = load double*, double** [[TMP_G1]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[TMP_G1]],
+      // LAMBDA: store double* [[G1_IN_REF]], double** [[TMP1_G1]],
+      // LAMBDA: [[TMP_G1_VAL:%.+]] = load double*, double** [[TMP1_G1]],
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
       g = 1;
       g1 = 1;
@@ -214,6 +217,7 @@
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
 // CHECK: [[TMP:%.*]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[TMP1:%.*]] = alloca [[S_FLOAT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -239,11 +243,13 @@
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
-// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store [[S_FLOAT_TY]]* [[VAR_ADDR_REF]], [[S_FLOAT_TY]]** [[TMP]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
+// CHECK: store [[S_FLOAT_TY]]* [[VAR_ADDR_REF]], [[S_FLOAT_TY]]** [[TMP1]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
+// CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP1]],
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -316,6 +322,7 @@
 // CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
 // CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP1:%.+]] = alloca [[S_INT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -344,8 +351,10 @@
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_ADDR1_REF]], [[S_INT_TY]]** [[TMP]],
+// CHECK-DAG: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
+// CHECK-DAG: store [[S_INT_TY]]* [[VAR_ADDR1_REF]], [[S_INT_TY]]** [[TMP1]],
 // CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
-// CHECK-DAG: [[TMP_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
+// CHECK-DAG: [[TMP_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP1]],
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
 // CHECK: [[IV_VAL1:%.+]] =
diff --git a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
index 7054182..9dc38b3 100644
--- a/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp
@@ -86,6 +86,7 @@
       // LAMBDA: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
       // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca float*,
       // LAMBDA: [[G1_REF:%.+]] = alloca double*,
+      // LAMBDA: [[G1_REF1:%.+]] = alloca double*,
 
       // private alloca's
       // LAMBDA: [[G_PRIV:%.+]] = alloca double,
@@ -111,7 +112,7 @@
       // LAMBDA-DAG: store {{.+}} [[G_ADDR_VAL]], {{.+}}* [[G_PRIV]],
 
       // g1
-      // LAMBDA-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}** [[G1_REF]],
+      // LAMBDA-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}** [[G1_REF1]],
       // LAMBDA-DAG: [[TMP_VAL:%.+]] = load {{.+}}, {{.+}}* [[TMP_REF]],
       // LAMBDA-DAG: store {{.+}} [[TMP_VAL]], {{.+}}* [[G1_PRIV]]
       // LAMBDA-DAG: store {{.+}}* [[G1_PRIV]], {{.+}}** [[TMP_PRIV]],
@@ -293,6 +294,7 @@
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}}*,
 // CHECK: [[TMP:%.+]] = alloca [[S_FLOAT_TY]]*,
+// CHECK: [[TMP1:%.+]] = alloca [[S_FLOAT_TY]]*,
 
 // skip loop alloca's
 // CHECK: [[OMP_IV:.omp.iv+]] = alloca i{{[0-9]+}},
@@ -342,7 +344,7 @@
 // CHECK-DAG: [[CPY_DONE]]:
 
 // var
-// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
+// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP1]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
 // CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
@@ -393,6 +395,7 @@
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[TMP:%.+]] = alloca [[S_FLOAT_TY]]*,
 
 // skip loop alloca's
 // CHECK: [[OMP_IV:.omp.iv+]] = alloca i{{[0-9]+}},
@@ -435,7 +438,7 @@
 // CHECK-DAG: [[CPY_DONE]]:
 
 // var
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
 // CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
@@ -477,6 +480,7 @@
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP1:%.+]] = alloca [[S_INT_TY]]*,
 
 // skip loop alloca's
 // CHECK: [[OMP_IV:.omp.iv+]] = alloca i{{[0-9]+}},
@@ -524,7 +528,7 @@
 // CHECK-DAG: [[CPY_DONE]]:
 
 // var
-// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
+// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP1]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
 // CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
@@ -564,6 +568,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP:%.+]] = alloca [[S_INT_TY]]*,
 
 // skip loop alloca's
 // CHECK: [[OMP_IV:.omp.iv+]] = alloca i{{[0-9]+}},
@@ -605,7 +610,7 @@
 // CHECK-DAG: [[CPY_DONE]]:
 
 // var
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
 // CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
index b10c02b..20b233f 100644
--- a/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp
@@ -106,9 +106,9 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: store double* [[G1_IN_REF]], double** [[TMP_G1]],
       // LAMBDA: [[TMP_G1_VAL:%.+]] = load double*, double** [[TMP_G1]],
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
@@ -129,7 +129,7 @@
       // LAMBDA: store{{.*}} double [[G_PRIV_VAL]], double* [[G_IN_REF]],
       // LAMBDA: [[TMP_G1_PRIV_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
       // LAMBDA: [[TMP_G1_PRIV_VAL:%.+]] = load double, double* [[TMP_G1_PRIV_REF]],
-      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* [[TMP_G1_VAL]],
+      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* %
 
       // LAMBDA: [[SVAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]],
       // LAMBDA: store i{{[0-9]+}} [[SVAR_PRIV_VAL]], i{{[0-9]+}}* [[SVAR_IN_REF]],
@@ -176,10 +176,10 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
 
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
 
       // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
@@ -212,7 +212,7 @@
       // LAMBDA: store{{.*}} double [[G_PRIV_VAL]], double* [[G_IN_REF]],
       // LAMBDA: [[TMP_G1_PRIV_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
       // LAMBDA: [[TMP_G1_PRIV_VAL:%.+]] = load double, double* [[TMP_G1_PRIV_REF]],
-      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* [[G1_IN_REF]],
+      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* %
       // LAMBDA: [[SVAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]],
       // LAMBDA: store i{{[0-9]+}} [[SVAR_PRIV_VAL]], i{{[0-9]+}}* [[SVAR_IN_REF]],
       // LAMBDA: [[SFVAR_PRIV_VAL:%.+]] = load float, float* [[SFVAR_PRIVATE]],
@@ -311,8 +311,8 @@
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
-// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store [[S_FLOAT_TY]]* [[VAR_ADDR_REF]], [[S_FLOAT_TY]]** [[TMP]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
 
@@ -327,7 +327,7 @@
 // CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
 // CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
 // CHECK: [[S_ARR_CST_END]]:
-// CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
+// CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // CHECK: call void [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
 
@@ -424,7 +424,7 @@
 // CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
 // CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
 // CHECK: [[S_ARR_CST_END]]:
-// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // CHECK: call void [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
 
@@ -567,7 +567,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* %{{.+}} to i8*
 // CHECK: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],{{.+}})
 // CHECK: ret void
@@ -605,6 +605,7 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
 
 // call constructor for s_arr
@@ -618,7 +619,6 @@
 // CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
 // CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
 // CHECK: [[S_ARR_CST_END]]:
-// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR]],
 // CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
 // CHECK: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]],
 
@@ -667,7 +667,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* %{{.+}} to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
index 50e45be..62e2c8c 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -108,6 +108,8 @@
 
       // g1
       // LAMBDA-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}** [[G1_REF]],
+      // LAMBDA-DAG: store {{.+}} [[TMP_REF]], {{.+}}* [[TMP:%.+]],
+      // LAMBDA-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}** [[TMP]],
       // LAMBDA-DAG: [[TMP_VAL:%.+]] = load {{.+}}, {{.+}}* [[TMP_REF]],
       // LAMBDA-DAG: store {{.+}} [[TMP_VAL]], {{.+}}* [[G1_PRIV]]
       // LAMBDA-DAG: store {{.+}}* [[G1_PRIV]], {{.+}}** [[TMP_PRIV]],
@@ -341,6 +343,8 @@
 
 // var
 // CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
+// CHECK-DAG: store {{.+}} [[TMP_REF]], {{.+}}* [[TMP1:%.+]],
+// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}** [[TMP1]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
 // CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
@@ -434,6 +438,8 @@
 
 // var
 // CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[VAR_ADDR_REF]], {{.+}}* [[TMP1:%.+]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}** [[TMP1]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
 // CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
@@ -523,6 +529,8 @@
 
 // var
 // CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
+// CHECK-DAG: store {{.+}} [[TMP_REF]], {{.+}}* [[TMP1:%.+]],
+// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}** [[TMP1]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
 // CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
@@ -604,6 +612,8 @@
 
 // var
 // CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
+// CHECK-DAG: store {{.+}} [[VAR_ADDR_REF]], {{.+}}* [[TMP1:%.+]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}** [[TMP1]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
 // CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
index c79bcee..516e9c0 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -106,9 +106,9 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: store double* [[G1_IN_REF]], double** [[TMP_G1]],
       // LAMBDA: [[TMP_G1_VAL:%.+]] = load double*, double** [[TMP_G1]],
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
@@ -138,7 +138,7 @@
       // LAMBDA: store{{.*}} double [[G_PRIV_VAL]], double* [[G_IN_REF]],
       // LAMBDA: [[TMP_G1_PRIV_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
       // LAMBDA: [[TMP_G1_PRIV_VAL:%.+]] = load double, double* [[TMP_G1_PRIV_REF]],
-      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* [[TMP_G1_VAL]],
+      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* %
 
       // LAMBDA: [[SVAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]],
       // LAMBDA: store i{{[0-9]+}} [[SVAR_PRIV_VAL]], i{{[0-9]+}}* [[SVAR_IN_REF]],
@@ -185,10 +185,10 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
 
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
 
       // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
@@ -222,7 +222,7 @@
       // LAMBDA: store{{.*}} double [[G_PRIV_VAL]], double* [[G_IN_REF]],
       // LAMBDA: [[TMP_G1_PRIV_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
       // LAMBDA: [[TMP_G1_PRIV_VAL:%.+]] = load double, double* [[TMP_G1_PRIV_REF]],
-      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* [[G1_IN_REF]],
+      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* %
       // LAMBDA: [[SVAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]],
       // LAMBDA: store i{{[0-9]+}} [[SVAR_PRIV_VAL]], i{{[0-9]+}}* [[SVAR_IN_REF]],
       // LAMBDA: [[SFVAR_PRIV_VAL:%.+]] = load float, float* [[SFVAR_PRIVATE]],
@@ -321,8 +321,8 @@
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
-// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store [[S_FLOAT_TY]]* [[VAR_ADDR_REF]], [[S_FLOAT_TY]]** [[TMP]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
 
@@ -337,7 +337,7 @@
 // CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
 // CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
 // CHECK: [[S_ARR_CST_END]]:
-// CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
+// CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // CHECK: call void [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
 
@@ -434,7 +434,7 @@
 // CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
 // CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
 // CHECK: [[S_ARR_CST_END]]:
-// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // CHECK: call void [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
 
@@ -577,7 +577,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* %{{.+}} to i8*
 // CHECK: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],{{.+}})
 // CHECK: ret void
@@ -628,7 +628,7 @@
 // CHECK: [[S_ARR_DONE:%.+]] = icmp {{.+}} [[S_ARR_NEXT]], [[S_ARR_END]]
 // CHECK: br i1 [[S_ARR_DONE]], label %[[S_ARR_CST_END:.+]], label %[[S_ARR_CST_LOOP]]
 // CHECK: [[S_ARR_CST_END]]:
-// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
 // CHECK: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]],
 
diff --git a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
index 36bd145..858a7b9 100644
--- a/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp
@@ -109,7 +109,7 @@
       // LAMBDA-DAG: [[G_VAL:%.+]] = load{{.+}} double, double* [[G_ADDR_VAL]],
       // LAMBDA-DAG: store double [[G_VAL]], double* [[G_PRIVATE]],
       // LAMBDA-DAG: [[G1_VAL_REF:%.+]] = load double*, double** [[G1_REF]],
-      // LAMBDA-DAG: [[G1_VAL:%.+]] = load{{.+}} double, double* [[G1_VAL_REF]],
+      // LAMBDA-DAG: [[G1_VAL:%.+]] = load{{.+}} double, double* %
       // LAMBDA-DAG: store double [[G1_VAL]], double* [[G1_PRIVATE]],
       // LAMBDA-DAG: store double* [[G1_PRIVATE]], double** [[TMP_PRIVATE]],
       // LAMBDA-DAG: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_ADDR_VAL]],
@@ -284,7 +284,7 @@
 // CHECK-DAG: store{{.+}} [[VAR_ADDR_VAL]],{{.+}} [[TMP]],
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to{{.+}}
-// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to{{.+}}
+// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* %{{.+}} to{{.+}}
 // CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
 // CHECK-DAG: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
 
@@ -379,7 +379,7 @@
 // CHECK-DAG: store{{.+}} [[VAR_ADDR_VAL]],{{.+}} [[TMP]],
 // CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
 // CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to{{.+}}
-// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to{{.+}}
+// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_INT_TY]]* %{{.+}} to{{.+}}
 // CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]],
 
diff --git a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
index 22a24a6..6bc146f 100644
--- a/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp
@@ -104,9 +104,9 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: store double* [[G1_IN_REF]], double** [[TMP_G1]],
       // LAMBDA: [[TMP_G1_VAL:%.+]] = load double*, double** [[TMP_G1]],
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
@@ -151,7 +151,7 @@
       // LAMBDA: store{{.*}} double [[G_PRIV_VAL]], double* [[G_IN_REF]],
       // LAMBDA: [[TMP_G1_PRIV_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
       // LAMBDA: [[TMP_G1_PRIV_VAL:%.+]] = load double, double* [[TMP_G1_PRIV_REF]],
-      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* [[TMP_G1_VAL]],
+      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* %
 
       // LAMBDA: [[SVAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]],
       // LAMBDA: store i{{[0-9]+}} [[SVAR_PRIV_VAL]], i{{[0-9]+}}* [[SVAR_IN_REF]],
@@ -250,11 +250,13 @@
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
-// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
+// CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store [[S_FLOAT_TY]]* [[VAR_ADDR_REF]], [[S_FLOAT_TY]]** [[TMP]],
-// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
 // CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
+// CHECK: store [[S_FLOAT_TY]]* [[TMP_REF]], [[S_FLOAT_TY]]** [[TMP1:%.+]],
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
+// CHECK: [[TMP_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP1]],
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -400,7 +402,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* %{{.+}} to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
diff --git a/clang/test/OpenMP/for_reduction_codegen.cpp b/clang/test/OpenMP/for_reduction_codegen.cpp
index 68946a8..2e616db 100644
--- a/clang/test/OpenMP/for_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen.cpp
@@ -292,14 +292,16 @@
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
 // CHECK: [[T_VAR_REF:%.+]] = load float*, float** %
+// CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // CHECK: [[VAR1_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // CHECK: [[T_VAR1_REF:%.+]] = load float*, float** %
 
 // For + reduction operation initial value of private variable is 0.
 // CHECK: store float 0.0{{.+}}, float* [[T_VAR_PRIV]],
 
-// For & reduction operation initial value of private variable is ones in all bits.
 // CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+
+// For & reduction operation initial value of private variable is ones in all bits.
 // CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 
 // For && reduction operation initial value of private variable is 1.0.
@@ -1104,6 +1106,8 @@
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 // CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 
 // CHECK: [[LOW:%.+]] = getelementptr inbounds [4 x [[S_FLOAT_TY]]], [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 1
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
@@ -1135,6 +1139,8 @@
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 // CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 
 // CHECK: [[LOW:%.+]] = getelementptr inbounds [4 x [[S_FLOAT_TY]]], [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 0
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
@@ -1163,6 +1169,8 @@
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 // CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 
 // CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
 // CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW:%.+]] to i64
@@ -1197,6 +1205,8 @@
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 // CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 // CHECK: getelementptr inbounds [4 x [[S_FLOAT_TY]]], [4 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], i32 0, i32 0
 // CHECK: getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 4
 
@@ -1233,6 +1243,7 @@
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
 // CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: [[VAR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: [[T_VAR1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
 
diff --git a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
index 0eee597..c7c3f93 100644
--- a/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
+++ b/clang/test/OpenMP/for_reduction_codegen_UDR.cpp
@@ -793,6 +793,8 @@
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 // CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 
 // CHECK: [[LOW:%.+]] = getelementptr inbounds [4 x [[S_FLOAT_TY]]], [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 1
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
@@ -823,6 +825,8 @@
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 // CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
 // CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [4 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [4 x [[S_FLOAT_TY]]]*, [4 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
 // CHECK: [[VAR3_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID:%.+]], i64 48, i8* inttoptr (i64 6 to i8*))
 // CHECK: [[VAR3_PRIV:%.+]] = bitcast i8* [[VAR3_VOID_PTR]] to [4 x %struct.S]*
 // CHECK: getelementptr inbounds [4 x [[S_FLOAT_TY]]], [4 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], i32 0, i32 0
diff --git a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
index 6696415..8aa6466 100644
--- a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -180,10 +180,10 @@
     // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV]] to i32*
-    // LAMBDA-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
-    // LAMBDA-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
     // LAMBDA-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV]] to i32*
     // LAMBDA-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV]] to i32*
+    // LAMBDA-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+    // LAMBDA-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
     // LAMBDA-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
     // LAMBDA-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
     // LAMBDA-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
@@ -303,10 +303,10 @@
 // BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
 // BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
 // BLOCKS-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV]] to i32*
-// BLOCKS-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
-// BLOCKS-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
 // BLOCKS-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV]] to i32*
 // BLOCKS-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV]] to i32*
+// BLOCKS-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+// BLOCKS-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
 // BLOCKS-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
 // BLOCKS-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
 // BLOCKS-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
@@ -452,10 +452,10 @@
 // CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[B_PRIV]]
 // CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[C_PRIV]]
 // CHECK-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV:%.+]] to i32*
-// CHECK-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
-// CHECK-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
 // CHECK-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV:%.+]] to i32*
 // CHECK-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV:%.+]] to i32*
+// CHECK-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+// CHECK-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
 // CHECK-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
 // CHECK-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
 // CHECK: bitcast [4 x i{{[0-9]+}}]* [[E_PRIV]] to i8*
diff --git a/clang/test/OpenMP/target_map_codegen.cpp b/clang/test/OpenMP/target_map_codegen.cpp
index 44d7ffc..8ed201a 100644
--- a/clang/test/OpenMP/target_map_codegen.cpp
+++ b/clang/test/OpenMP/target_map_codegen.cpp
@@ -4581,12 +4581,16 @@
 // CK26: define {{.+}}[[CALL00]]({{.*}}i32*{{.*}}[[PVTARG:%.+]])
 // CK26: store i32* [[PVTARG]], i32** [[PVTADDR:%.+]],
 // CK26: [[ADDR:%.+]] = load i32*, i32** [[PVTADDR]],
+// CK26: store i32* [[ADDR]], i32** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load i32*, i32** [[PVTADDR]],
 // CK26: [[VAL:%.+]] = load i32, i32* [[ADDR]],
 // CK26: add nsw i32 [[VAL]], 1
 
 // CK26: define {{.+}}[[CALL01]]({{.*}}float*{{.*}}[[PVTARG:%.+]])
 // CK26: store float* [[PVTARG]], float** [[PVTADDR:%.+]],
 // CK26: [[ADDR:%.+]] = load float*, float** [[PVTADDR]],
+// CK26: store float* [[ADDR]], float** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load float*, float** [[PVTADDR]],
 // CK26: [[VAL:%.+]] = load float, float* [[ADDR]],
 // CK26: [[EXT:%.+]] = fpext float [[VAL]] to double
 // CK26: fadd double [[EXT]], 1.000000e+00
@@ -4594,12 +4598,16 @@
 // CK26: define {{.+}}[[CALL02]]({{.*}}i32*{{.*}}[[PVTARG:%.+]])
 // CK26: store i32* [[PVTARG]], i32** [[PVTADDR:%.+]],
 // CK26: [[ADDR:%.+]] = load i32*, i32** [[PVTADDR]],
+// CK26: store i32* [[ADDR]], i32** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load i32*, i32** [[PVTADDR]],
 // CK26: [[VAL:%.+]] = load i32, i32* [[ADDR]],
 // CK26: add nsw i32 [[VAL]], 1
 
 // CK26: define {{.+}}[[CALL03]]({{.*}}float*{{.*}}[[PVTARG:%.+]])
 // CK26: store float* [[PVTARG]], float** [[PVTADDR:%.+]],
 // CK26: [[ADDR:%.+]] = load float*, float** [[PVTADDR]],
+// CK26: store float* [[ADDR]], float** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load float*, float** [[PVTADDR]],
 // CK26: [[VAL:%.+]] = load float, float* [[ADDR]],
 // CK26: [[EXT:%.+]] = fpext float [[VAL]] to double
 // CK26: fadd double [[EXT]], 1.000000e+00
diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
index 5487fa8..c268b72 100644
--- a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp
@@ -274,6 +274,7 @@
 // CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
 // CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
 
+// CHECK: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
 // CHECK: ret void
 
@@ -304,15 +305,17 @@
 // CHECK: store [2 x [[S_INT_TY]]]* {{.+}}, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 // CHECK: store [[S_INT_TY]]* {{.+}}, [[S_INT_TY]]** [[VAR_ADDR]],
 
-
 // T_VAR and preparation variables
 // CHECK: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
+// CHECK: [[VAR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK: store [[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]** [[VAR_ADDR:%.+]],
+
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
diff --git a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
index 26a7c0d..459384f 100644
--- a/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp
@@ -107,15 +107,15 @@
       // LAMBDA-32: store i32 [[SFVAR_IN]], i32* [[SFVAR_PRIVATE_ADDR]],
 
       // init private variables
-      // LAMBDA-64: [[G_IN_REF:%.+]] = bitcast i64* [[G_PRIVATE_ADDR]] to double*
       // LAMBDA-32: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA-32: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+      // LAMBDA-32: [[SFVAR_IN_REF:%.+]] = bitcast i32* [[SFVAR_PRIVATE_ADDR]] to float*
+      // LAMBDA-64: [[G_IN_REF:%.+]] = bitcast i64* [[G_PRIVATE_ADDR]] to double*
       // LAMBDA-64: [[G1_IN_REF:%.+]] = bitcast i64* [[G1_PRIVATE_ADDR]] to double*
-      // LAMBDA-64: store double* [[G1_IN_REF]], double** [[G1_IN_ADDR_REF:%.+]],
       // LAMBDA-64: [[SVAR_IN_REF:%.+]] = bitcast i64* [[SVAR_PRIVATE_ADDR]] to i32*
       // LAMBDA-64: [[SFVAR_IN_REF:%.+]] = bitcast i64* [[SFVAR_PRIVATE_ADDR]] to float*
-      // LAMBDA-32: [[SFVAR_IN_REF:%.+]] = bitcast i32* [[SFVAR_PRIVATE_ADDR]] to float*
+      // LAMBDA-64: store double* [[G1_IN_REF]], double** [[G1_IN_ADDR_REF:%.+]],
       // LAMBDA-64: [[G1_IN_REF:%.+]] = load double*, double** [[G1_IN_ADDR_REF]],
-      // LAMBDA-32: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
       g = 1;
       g1 = 1;
@@ -147,7 +147,7 @@
       // LAMBDA: store{{.*}} double [[G_PRIV_VAL]], double* [[G_IN_REF]],
       // LAMBDA: [[TMP_G1_PRIV_REF:%.+]] = load double*, double** [[TMP_G1_PRIVATE]],
       // LAMBDA: [[TMP_G1_PRIV_VAL:%.+]] = load double, double* [[TMP_G1_PRIV_REF]],
-      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* [[G1_IN_REF]],
+      // LAMBDA: store{{.*}} double [[TMP_G1_PRIV_VAL]], double* %
 
       // LAMBDA: [[SVAR_PRIV_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SVAR_PRIVATE]],
       // LAMBDA-64: store i{{[0-9]+}} [[SVAR_PRIV_VAL]], i{{[0-9]+}}* [[SVAR_IN_REF]],
@@ -244,9 +244,9 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK-64: [[T_VAR_ADDR_REF:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
 // CHECK-64: [[SVAR_ADDR_REF:%.+]] = bitcast i64* [[SVAR_ADDR]] to i32*
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -291,7 +291,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* %{{.+}} to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
@@ -346,8 +346,8 @@
 // CHECK: [[VEC_ADDR_REF1:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR1]],
 // CHECK-64: [[T_VAR_ADDR_REF1:%.+]] = bitcast i64* [[T_VAR_ADDR1]] to i32*
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
-// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV1]], [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -393,7 +393,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* %{{.+}} to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
index 843b667..9c91538 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -391,6 +391,7 @@
 // CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
 // CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
 
+// CHECK: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
 // CHECK: ret void
 
@@ -426,9 +427,12 @@
 // CHECK-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
+// CHECK: [[VAR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK: store [[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]** [[VAR_ADDR:%.+]],
+
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
@@ -487,9 +491,12 @@
 // CHECK-64: [[CONV_TVAR:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
+// CHECK: [[VAR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK: store [[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]** [[VAR_ADDR:%.+]],
+
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
index acb1c0b..c0f9eca 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -83,7 +83,7 @@
       // LAMBDA: [[SVAR_ADDR:%.+]] = alloca {{.+}},
       // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca {{.+}},
       // LAMBDA: [[G_ADDR:%.+]] = alloca {{.+}},
-      // LAMBDA-64: [[G1_REF:%.+]] = alloca double*,
+      // LAMBDA: [[G1_REF:%.+]] = alloca double*,
       // loop variables
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
@@ -91,19 +91,18 @@
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
-       
+
       // LAMBDA-DAG: store {{.+}} [[G_IN]], {{.+}} [[G_ADDR]],
       // LAMBDA-DAG: store {{.+}} [[G1_IN]], {{.+}} [[G1_ADDR]],
       // LAMBDA-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
       // LAMBDA-DAG: store {{.+}} [[SFVAR_IN]], {{.+}} [[SFVAR_ADDR]],
-    
+
       // LAMBDA-64-DAG: [[G_TGT:%.+]] = bitcast {{.+}} [[G_ADDR]] to
       // LAMBDA-32-DAG: [[G_TGT:%.+]] = load {{.+}}, {{.+}} [[G_ADDR]],
-      // LAMBDA-64-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
-      // LAMBDA-32-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_ADDR]],
+      // LAMBDA-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
       // LAMBDA-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
       // LAMBDA-DAG: [[SFVAR_TGT:%.+]] = bitcast {{.+}} [[SFVAR_ADDR]] to
-      
+
       g1 = 1;
       svar = 3;
       sfvar = 4.0;
@@ -135,7 +134,7 @@
       // LAMBDA: [[SVAR_ADDR:%.+]] = alloca {{.+}},
       // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca {{.+}},
       // LAMBDA: [[G_ADDR:%.+]] = alloca {{.+}},
-      // LAMBDA-64: [[G1_REF:%.+]] = alloca double*,
+      // LAMBDA: [[G1_REF:%.+]] = alloca double*,
       // loop variables
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
@@ -143,16 +142,15 @@
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: [[OMP_IS_LAST:%.+]] = alloca i{{[0-9]+}},
-       
+
       // LAMBDA-DAG: store {{.+}} [[G_IN]], {{.+}} [[G_ADDR]],
       // LAMBDA-DAG: store {{.+}} [[G1_IN]], {{.+}} [[G1_ADDR]],
       // LAMBDA-DAG: store {{.+}} [[SVAR_IN]], {{.+}} [[SVAR_ADDR]],
       // LAMBDA-DAG: store {{.+}} [[SFVAR_IN]], {{.+}} [[SFVAR_ADDR]],
-    
+
       // LAMBDA-64-DAG: [[G_TGT:%.+]] = bitcast {{.+}} [[G_ADDR]] to
       // LAMBDA-32-DAG: [[G_TGT:%.+]] = load {{.+}}, {{.+}} [[G_ADDR]],
-      // LAMBDA-64-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
-      // LAMBDA-32-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_ADDR]],
+      // LAMBDA-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
       // LAMBDA-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
       // LAMBDA-DAG: [[SFVAR_TGT:%.+]] = bitcast {{.+}} [[SFVAR_ADDR]] to
 
@@ -164,7 +162,7 @@
       // LAMBDA: [[OMP_IS_LAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[OMP_IS_LAST]],
       // LAMBDA: [[IS_LAST_IT:%.+]] = icmp ne i{{[0-9]+}} [[OMP_IS_LAST_VAL]], 0
       // LAMBDA: br i1 [[IS_LAST_IT]], label %[[OMP_LASTPRIV_BLOCK:.+]], label %[[OMP_LASTPRIV_DONE:.+]]
-      
+
       // LAMBDA: [[OMP_LASTPRIV_BLOCK]]:
       // LAMBDA-DAG: store {{.+}}, {{.+}} [[G_TGT]],
       // LAMBDA-DAG: store {{.+}}, {{.+}} [[G1_TGT]],
@@ -176,26 +174,26 @@
       // LAMBDA: ret
 
       [&]() {
-	// LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
-	// LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
-	g = 2;
-	g1 = 2;
-	svar = 4;
-	sfvar = 8.0;
-	// LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
-	// LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-	// LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
-	// LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+        // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+        // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+        g = 2;
+        g1 = 2;
+        svar = 4;
+        sfvar = 8.0;
+        // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+        // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+        // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+        // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
 
-	// LAMBDA: [[TMP_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
-	// LAMBDA: [[G1_REF:%.+]] = load double*, double** [[TMP_PTR_REF]]
-	// LAMBDA: store double 2.0{{.+}}, double* [[G1_REF]],
-	// LAMBDA: [[SVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
-	// LAMBDA: [[SVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PTR_REF]]
-	// LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SVAR_REF]]
-	// LAMBDA: [[SFVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
-	// LAMBDA: [[SFVAR_REF:%.+]] = load float*, float** [[SFVAR_PTR_REF]]
-	// LAMBDA: store float 8.0{{.+}}, float* [[SFVAR_REF]]
+        // LAMBDA: [[TMP_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+        // LAMBDA: [[G1_REF:%.+]] = load double*, double** [[TMP_PTR_REF]]
+        // LAMBDA: store double 2.0{{.+}}, double* [[G1_REF]],
+        // LAMBDA: [[SVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+        // LAMBDA: [[SVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PTR_REF]]
+        // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SVAR_REF]]
+        // LAMBDA: [[SFVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+        // LAMBDA: [[SFVAR_REF:%.+]] = load float*, float** [[SFVAR_PTR_REF]]
+        // LAMBDA: store float 8.0{{.+}}, float* [[SFVAR_REF]]
       }();
     }
   }();
@@ -237,6 +235,7 @@
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -256,7 +255,7 @@
 // CHECK-64-DAG: [[TVAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
 // CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR]],
 // CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR]],
-// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[TMP_VAR_ADDR]],
 // CHECK-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
 
 // the distribute loop
@@ -295,6 +294,7 @@
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -314,7 +314,7 @@
 // CHECK-64-DAG: [[TVAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
 // CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR]],
 // CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR]],
-// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[TMP_VAR_ADDR]],
 // CHECK-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
 
 // the distribute loop
@@ -360,6 +360,7 @@
 // CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -378,7 +379,7 @@
 // CHECK-64-DAG: [[T_VAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR1]] to
 // CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR1]],
 // CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR1]],
-// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR1]], 
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[TMP_VAR_ADDR1]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL:.+]] to
@@ -397,7 +398,7 @@
 // CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
 // CHECK: call void @llvm.memcpy.{{.+}}(
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],  
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
 // CHECK: ret void
 
 // CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
@@ -411,6 +412,7 @@
 // CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -429,7 +431,7 @@
 // CHECK-64-DAG: [[T_VAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR1]] to
 // CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR1]],
 // CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR1]],
-// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR1]], 
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[TMP_VAR_ADDR1]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // skip body: code generation routine is same as distribute parallel for lastprivate
@@ -448,7 +450,7 @@
 // CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
 // CHECK: call void @llvm.memcpy.{{.+}}(
 // CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],  
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
 // CHECK: ret void
 
 #endif
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index 0b09cc6..978e90d 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -391,6 +391,7 @@
 // CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
 // CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
 
+// CHECK: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
 // CHECK: ret void
 
@@ -402,6 +403,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // Skip temp vars for loop
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
@@ -444,7 +446,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
@@ -463,6 +465,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // Skip temp vars for loop
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
@@ -505,7 +508,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index c3d3861..ca09990 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -83,7 +83,7 @@
       // LAMBDA: [[SVAR_ADDR:%.+]] = alloca {{.+}},
       // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca {{.+}},
       // LAMBDA: [[G_ADDR:%.+]] = alloca {{.+}},
-      // LAMBDA-64: [[G1_REF:%.+]] = alloca double*,
+      // LAMBDA: [[G1_REF:%.+]] = alloca double*,
       // loop variables
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
@@ -99,8 +99,7 @@
 
       // LAMBDA-64-DAG: [[G_TGT:%.+]] = bitcast {{.+}} [[G_ADDR]] to
       // LAMBDA-32-DAG: [[G_TGT:%.+]] = load {{.+}}, {{.+}} [[G_ADDR]],
-      // LAMBDA-64-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
-      // LAMBDA-32-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_ADDR]],
+      // LAMBDA-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
       // LAMBDA-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
       // LAMBDA-DAG: [[SFVAR_TGT:%.+]] = bitcast {{.+}} [[SFVAR_ADDR]] to
 
@@ -136,7 +135,7 @@
       // LAMBDA: [[SVAR_ADDR:%.+]] = alloca {{.+}},
       // LAMBDA: [[SFVAR_ADDR:%.+]] = alloca {{.+}},
       // LAMBDA: [[G_ADDR:%.+]] = alloca {{.+}},
-      // LAMBDA-64: [[G1_REF:%.+]] = alloca double*,
+      // LAMBDA: [[G1_REF:%.+]] = alloca double*,
       // loop variables
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
       // LAMBDA: {{.+}} = alloca i{{[0-9]+}},
@@ -152,8 +151,7 @@
 
       // LAMBDA-64-DAG: [[G_TGT:%.+]] = bitcast {{.+}} [[G_ADDR]] to
       // LAMBDA-32-DAG: [[G_TGT:%.+]] = load {{.+}}, {{.+}} [[G_ADDR]],
-      // LAMBDA-64-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
-      // LAMBDA-32-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_ADDR]],
+      // LAMBDA-DAG: [[G1_TGT:%.+]] = load {{.+}}, {{.+}} [[G1_REF]],
       // LAMBDA-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
       // LAMBDA-DAG: [[SFVAR_TGT:%.+]] = bitcast {{.+}} [[SFVAR_ADDR]] to
 
@@ -239,6 +237,7 @@
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -258,7 +257,7 @@
 // CHECK-64-DAG: [[TVAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
 // CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR]],
 // CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR]],
-// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[TMP_VAR_ADDR]],
 // CHECK-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
 
 // the distribute loop
@@ -297,6 +296,7 @@
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // CHECK: [[SVAR_ADDR:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_FLOAT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -316,7 +316,7 @@
 // CHECK-64-DAG: [[TVAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR]] to
 // CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR]],
 // CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR]],
-// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[TMP_VAR_ADDR]],
 // CHECK-64-DAG: [[SVAR_TGT:%.+]] = bitcast {{.+}} [[SVAR_ADDR]] to
 
 // the distribute loop
@@ -362,6 +362,7 @@
 // CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -380,7 +381,7 @@
 // CHECK-64-DAG: [[T_VAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR1]] to
 // CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR1]],
 // CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR1]],
-// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR1]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[TMP_VAR_ADDR1]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[TPAR_OUTL:.+]] to
@@ -413,6 +414,7 @@
 // CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -431,7 +433,7 @@
 // CHECK-64-DAG: [[T_VAR_TGT:%.+]] = bitcast {{.+}} [[T_VAR_ADDR1]] to
 // CHECK-DAG: [[VEC_TGT:%.+]] = load {{.+}}, {{.+}} [[VEC_ADDR1]],
 // CHECK-DAG: [[S_ARR_TGT:%.+]] = load {{.+}}, {{.+}} [[S_ARR_ADDR1]],
-// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR1]],
+// CHECK-DAG: [[VAR_TGT:%.+]] = load {{.+}}, {{.+}} [[TMP_VAR_ADDR1]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // skip body: code generation routine is same as distribute parallel for lastprivate
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
index 463a6af..05ea0e8 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp
@@ -274,6 +274,7 @@
 // CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
 // CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
 
+// CHECK: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
 // CHECK: ret void
 
@@ -285,6 +286,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // Skip temp vars for loop
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
@@ -328,7 +330,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
index fdf6d2f..6fae782 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp
@@ -110,12 +110,13 @@
       // LAMBDA-64: [[G_IN_REF:%.+]] = bitcast i64* [[G_PRIVATE_ADDR]] to double*
       // LAMBDA-32: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
       // LAMBDA-64: [[G1_IN_REF:%.+]] = bitcast i64* [[G1_PRIVATE_ADDR]] to double*
-      // LAMBDA-64: store double* [[G1_IN_REF]], double** [[G1_IN_ADDR_REF:%.+]],
+      // LAMBDA-32: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA-64: [[SVAR_IN_REF:%.+]] = bitcast i64* [[SVAR_PRIVATE_ADDR]] to i32*
       // LAMBDA-64: [[SFVAR_IN_REF:%.+]] = bitcast i64* [[SFVAR_PRIVATE_ADDR]] to float*
       // LAMBDA-32: [[SFVAR_IN_REF:%.+]] = bitcast i32* [[SFVAR_PRIVATE_ADDR]] to float*
+      // LAMBDA-64: store double* [[G1_IN_REF]], double** [[G1_IN_ADDR_REF:%.+]],
       // LAMBDA-64: [[G1_IN_REF:%.+]] = load double*, double** [[G1_IN_ADDR_REF]],
-      // LAMBDA-32: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+      // LAMBDA-32: [[G1_IN_REF:%.+]] = load double*, double** %
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
       g = 1;
       g1 = 1;
@@ -245,9 +246,10 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK-64: [[T_VAR_ADDR_REF:%.+]] = bitcast i64* [[T_VAR_ADDR]] to i32*
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
 // CHECK-64: [[SVAR_ADDR_REF:%.+]] = bitcast i64* [[SVAR_ADDR]] to i32*
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK:  [[TMP_VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -292,7 +294,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
@@ -347,8 +349,9 @@
 // CHECK: [[VEC_ADDR_REF1:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR1]],
 // CHECK-64: [[T_VAR_ADDR_REF1:%.+]] = bitcast i64* [[T_VAR_ADDR1]] to i32*
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
-// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK:  [[TMP_VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV1]], [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -394,7 +397,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
diff --git a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
index 562e986..672ed8e 100644
--- a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp
@@ -277,6 +277,7 @@
 // CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
 // CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
 
+// CHECK: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
 // CHECK: ret void
 
@@ -288,6 +289,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // Skip temp vars for loop
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
@@ -314,8 +316,8 @@
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
@@ -331,7 +333,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
diff --git a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
index 8832341..581bb1d 100644
--- a/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp
@@ -101,9 +101,10 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** %
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
       g = 1;
       g1 = 1;
@@ -232,9 +233,10 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
 // CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK: [[TMP_VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -278,7 +280,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAR_ADDR_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
@@ -332,8 +334,9 @@
 // CHECK: [[VEC_ADDR_REF1:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR1]],
 // CHECK: [[T_VAR_ADDR_REF1:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR1]],
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
-// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[TMP_VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV1]], [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -378,7 +381,7 @@
 // CHECK: br i1 [[CPY_IS_FINISHED]], label %[[S_ARR_COPY_DONE]], label %[[S_ARR_COPY_BLOCK]]
 // CHECK: [[S_ARR_COPY_DONE]]:
 // CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
-// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
+// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAR_ADDR1_REF]] to i8*
 // CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
 // CHECK: ret void
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
index 7e161b1..14bccc0 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp
@@ -367,6 +367,7 @@
 // CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
 // CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
 
+// CHECK: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
 // CHECK: ret void
 
@@ -378,6 +379,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // Skip temp vars for loop
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
@@ -403,8 +405,8 @@
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
@@ -420,7 +422,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
@@ -439,6 +441,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // Skip temp vars for loop
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
@@ -464,8 +467,8 @@
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
@@ -481,7 +484,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
index 451dc05..bc0b210 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp
@@ -101,9 +101,10 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** %
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
       g = 1;
       g1 = 1;
@@ -156,9 +157,10 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** %
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
 
       // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
@@ -286,9 +288,10 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
 // CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL:.+]] to
@@ -365,9 +368,10 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
 // CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 
@@ -466,8 +470,9 @@
 // CHECK: [[VEC_ADDR_REF1:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR1]],
 // CHECK: [[T_VAR_ADDR_REF1:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR1]],
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
-// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV1]], [[S_INT_TY]]** [[TMP_PRIV1]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
@@ -543,8 +548,9 @@
 // CHECK: [[VEC_ADDR_REF1:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR1]],
 // CHECK: [[T_VAR_ADDR_REF1:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR1]],
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
-// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV1]], [[S_INT_TY]]** [[TMP_PRIV1]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
index 68b0016..6e8c0d1 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp
@@ -370,6 +370,7 @@
 // CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
 // CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
 
+// CHECK: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
 // CHECK: ret void
 
@@ -381,6 +382,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // Skip temp vars for loop
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
@@ -406,8 +408,8 @@
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
@@ -423,7 +425,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
@@ -467,8 +469,8 @@
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
@@ -484,7 +486,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
index c9141b0..4d21a05 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp
@@ -101,9 +101,10 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** %
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
       g = 1;
       g1 = 1;
@@ -157,9 +158,10 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** %
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
 
       // LAMBDA: call {{.*}}void @__kmpc_for_static_init_4(
@@ -293,9 +295,10 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
 // CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void {{.*}} @__kmpc_fork_call({{.+}}, {{.+}}, {{.+}} @[[PAR_OUTL:.+]] to
@@ -372,9 +375,10 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
 // CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 
@@ -476,8 +480,9 @@
 // CHECK: [[VEC_ADDR_REF1:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR1]],
 // CHECK: [[T_VAR_ADDR_REF1:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR1]],
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
-// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV1]], [[S_INT_TY]]** [[TMP_PRIV1]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
@@ -553,8 +558,9 @@
 // CHECK: [[VEC_ADDR_REF1:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR1]],
 // CHECK: [[T_VAR_ADDR_REF1:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR1]],
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
-// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV1]], [[S_INT_TY]]** [[TMP_PRIV1]],
 
 // CHECK: call void @__kmpc_for_static_init_4(
diff --git a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
index ec0b006..f76e2c7 100644
--- a/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp
@@ -277,6 +277,7 @@
 // CHECK-DAG: [[TS_ARR_TE_PAR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[TS_ARR_PRIV]],
 // CHECK-DAG: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TVAR_PRIV]],
 
+// CHECK: [[TVAR_TE_PAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 // CHECK: call void {{.+}} @__kmpc_fork_teams({{.+}}, i32 4, {{.+}} @[[TOUTL1:.+]] to {{.+}}, [2 x i{{[0-9]+}}]* [[TVEC_TE_PAR]], i{{[0-9]+}} [[TT_VAR_TE_PAR]], [2 x [[S_INT_TY]]]* [[TS_ARR_TE_PAR]], [[S_INT_TY]]* [[TVAR_TE_PAR]])
 // CHECK: ret void
 
@@ -288,6 +289,7 @@
 // CHECK: [[T_VAR_ADDR:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // Skip temp vars for loop
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
@@ -314,8 +316,8 @@
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
 
 // firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
-// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8* 
-// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8* 
+// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
 
 // firstprivate(s_arr)
@@ -331,7 +333,7 @@
 // CHECK-DAG: [[S_ARR_SRC]] = getelementptr {{.+}} [[S_ARR_SRC_COPY]],
 
 // firstprivate(var)
-// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[VAR_ADDR]],
+// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load{{.+}} [[TMP_VAR_ADDR]],
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[VAR_PRIV]], {{.+}} [[VAR_ADDR_REF]], {{.+}} [[AGG_TMP2]])
 // CHECK-DAG: call void @{{.+}}({{.+}} [[AGG_TMP2]])
diff --git a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
index 86cfd00..c7b507f 100644
--- a/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp
@@ -101,9 +101,11 @@
 
       // init private variables
       // LAMBDA: [[G_IN_REF:%.+]] = load double*, double** [[G_PRIVATE_ADDR]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
       // LAMBDA: [[SVAR_IN_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_PRIVATE_ADDR]],
       // LAMBDA: [[SFVAR_IN_REF:%.+]] = load float*, float** [[SFVAR_PRIVATE_ADDR]],
-      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[G1_PRIVATE_ADDR]],
+      // LAMBDA: store double* [[G1_IN_REF]], double** [[TMP:%.+]],
+      // LAMBDA: [[G1_IN_REF:%.+]] = load double*, double** [[TMP]],
       // LAMBDA: store double* [[G1_PRIVATE]], double** [[TMP_G1_PRIVATE]],
       g = 1;
       g1 = 1;
@@ -233,9 +235,11 @@
 // CHECK: [[VEC_ADDR_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
 // CHECK: [[T_VAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
 // CHECK: [[S_ARR_ADDR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
+// CHECK: [[VAR_ADDR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[VAR_ADDR]],
 // CHECK: [[SVAR_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SVAR_ADDR]],
+// CHECK: store [[S_FLOAT_TY]]* [[VAR_ADDR_REF]], [[S_FLOAT_TY]]** [[TMP:%.+]],
 // CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST]],
-// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[VAR_ADDR]],
+// CHECK:  [[VAR_ADDR_REF:%.+]] = load {{.+}}, {{.+}} [[TMP]],
 // the distribute loop
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;
@@ -307,6 +311,7 @@
 // CHECK: [[T_VAR_ADDR1:%.+]] = alloca i{{[0-9]+}}*,
 // CHECK: [[S_ARR_ADDR1:%.+]] = alloca [2 x [[S_INT_TY]]]*,
 // CHECK: [[VAR_ADDR1:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: [[TMP1:%.+]] = alloca [[S_INT_TY]]*,
 // skip loop variables
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
 // CHECK: {{.+}} = alloca i{{[0-9]+}},
@@ -333,8 +338,10 @@
 // CHECK: [[VEC_ADDR_REF1:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR1]],
 // CHECK: [[T_VAR_ADDR_REF1:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR1]],
 // CHECK: [[S_ARR_ADDR_REF1:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR1]],
-// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
 // CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[VAR_ADDR1]],
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR1_REF]], [[S_INT_TY]]** [[TMP1]],
+// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[OMP_IS_LAST1]],
+// CHECK: [[VAR_ADDR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP1]],
 // CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV1]], [[S_INT_TY]]** [[TMP_PRIV1]],
 // CHECK: call void @__kmpc_for_static_init_4(
 // assignment: vec[i] = t_var;