; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

;
; Unary shuffle indices from registers
;

define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
; ALL: # BB#0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
; ALL-NEXT: subq $64, %rsp
; ALL-NEXT: vmovaps %ymm0, (%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: movq %rbp, %rsp
; ALL-NEXT: popq %rbp
; ALL-NEXT: retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double %x3, i32 3
  ret <4 x double> %r3
}

define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
; ALL: # BB#0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
; ALL-NEXT: subq $64, %rsp
; ALL-NEXT: vmovaps %ymm0, (%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: movq %rbp, %rsp
; ALL-NEXT: popq %rbp
; ALL-NEXT: retq
  %x0 = extractelement <4 x double> %x, i64 %i0
  %x1 = extractelement <4 x double> %x, i64 %i1
  %x2 = extractelement <4 x double> %x, i64 %i2
  %x3 = extractelement <4 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double undef, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double 0.0, i32 3
  ret <4 x double> %r3
}

define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
; ALL: # BB#0:
; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: retq
  %x0 = extractelement <2 x double> %x, i64 %i0
  %x1 = extractelement <2 x double> %x, i64 %i1
  %x2 = extractelement <2 x double> %x, i64 %i2
  %x3 = extractelement <2 x double> %x, i64 %i3
  %r0 = insertelement <4 x double> undef, double %x0, i32 0
  %r1 = insertelement <4 x double> %r0, double %x1, i32 1
  %r2 = insertelement <4 x double> %r1, double %x2, i32 2
  %r3 = insertelement <4 x double> %r2, double %x3, i32 3
  ret <4 x double> %r3
}

define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2: # BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
; AVX2: # BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 0, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 0, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: movslq %edi, %rax
; AVX1-NEXT: movslq %esi, %rsi
; AVX1-NEXT: movslq %edx, %rdx
; AVX1-NEXT: movslq %ecx, %r11
; AVX1-NEXT: movslq %r8d, %r10
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: movslq %r9d, %r8
; AVX1-NEXT: movslq 16(%rbp), %rdi
; AVX1-NEXT: movslq 24(%rbp), %rcx
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
; AVX2: # BB#0:
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vmovd %esi, %xmm2
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm2
; AVX2-NEXT: vmovd %edx, %xmm3
; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3
; AVX2-NEXT: vmovd %ecx, %xmm4
; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4
; AVX2-NEXT: vmovd %r8d, %xmm5
; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm5
; AVX2-NEXT: vmovd %r9d, %xmm6
; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm6
; AVX2-NEXT: vmovd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; AVX2-NEXT: vpermps %ymm0, %ymm7, %ymm7
; AVX2-NEXT: vmovd {{.*#+}} xmm8 = mem[0],zero,zero,zero
; AVX2-NEXT: vpermps %ymm0, %ymm8, %ymm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 %i0
  %x1 = extractelement <8 x float> %x, i32 %i1
  %x2 = extractelement <8 x float> %x, i32 %i2
  %x3 = extractelement <8 x float> %x, i32 %i3
  %x4 = extractelement <8 x float> %x, i32 %i4
  %x5 = extractelement <8 x float> %x, i32 %i5
  %x6 = extractelement <8 x float> %x, i32 %i6
  %x7 = extractelement <8 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float> %r0, float %x1, i32 1
  %r2 = insertelement <8 x float> %r1, float %x2, i32 2
  %r3 = insertelement <8 x float> %r2, float %x3, i32 3
  %r4 = insertelement <8 x float> %r3, float %x4, i32 4
  %r5 = insertelement <8 x float> %r4, float %x5, i32 5
  %r6 = insertelement <8 x float> %r5, float %x6, i32 6
  %r7 = insertelement <8 x float> %r6, float %x7, i32 7
  ret <8 x float> %r7
}

define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
; ALL: # BB#0:
; ALL-NEXT: movslq %edi, %rax
; ALL-NEXT: movslq %esi, %rsi
; ALL-NEXT: movslq %edx, %rdx
; ALL-NEXT: movslq %ecx, %r11
; ALL-NEXT: movslq %r8d, %r10
; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; ALL-NEXT: movslq %r9d, %r8
; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rdi
; ALL-NEXT: movslq {{[0-9]+}}(%rsp), %rcx
; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; ALL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; ALL-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 %i0
  %x1 = extractelement <4 x float> %x, i32 %i1
  %x2 = extractelement <4 x float> %x, i32 %i2
  %x3 = extractelement <4 x float> %x, i32 %i3
  %x4 = extractelement <4 x float> %x, i32 %i4
  %x5 = extractelement <4 x float> %x, i32 %i5
  %x6 = extractelement <4 x float> %x, i32 %i6
  %x7 = extractelement <4 x float> %x, i32 %i7
  %r0 = insertelement <8 x float> undef, float %x0, i32 0
  %r1 = insertelement <8 x float> %r0, float %x1, i32 1
  %r2 = insertelement <8 x float> %r1, float %x2, i32 2
  %r3 = insertelement <8 x float> %r2, float %x3, i32 3
  %r4 = insertelement <8 x float> %r3, float %x4, i32 4
  %r5 = insertelement <8 x float> %r4, float %x5, i32 5
  %r6 = insertelement <8 x float> %r5, float %x6, i32 6
  %r7 = insertelement <8 x float> %r6, float %x7, i32 7
  ret <8 x float> %r7
}

define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: movslq 32(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: movslq 40(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq 48(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq 56(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq 64(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq 72(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq 80(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq 88(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq %edi, %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: movslq %esi, %rax
; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq %edx, %rax
; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq %ecx, %rax
; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq %r8d, %rax
; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq %r9d, %rax
; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq 16(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX1-NEXT: movslq 24(%rbp), %rax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
; AVX2: # BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: movslq 32(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: movslq 40(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq 48(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq 56(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq 64(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq 72(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq 80(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq 88(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq %edi, %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: movslq %esi, %rax
; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq %edx, %rax
; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq %ecx, %rax
; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq %r8d, %rax
; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq %r9d, %rax
; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq 16(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX2-NEXT: movslq 24(%rbp), %rax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
  %x0 = extractelement <16 x i16> %x, i32 %i0
  %x1 = extractelement <16 x i16> %x, i32 %i1
  %x2 = extractelement <16 x i16> %x, i32 %i2
  %x3 = extractelement <16 x i16> %x, i32 %i3
  %x4 = extractelement <16 x i16> %x, i32 %i4
  %x5 = extractelement <16 x i16> %x, i32 %i5
  %x6 = extractelement <16 x i16> %x, i32 %i6
  %x7 = extractelement <16 x i16> %x, i32 %i7
  %x8 = extractelement <16 x i16> %x, i32 %i8
  %x9 = extractelement <16 x i16> %x, i32 %i9
  %x10 = extractelement <16 x i16> %x, i32 %i10
  %x11 = extractelement <16 x i16> %x, i32 %i11
  %x12 = extractelement <16 x i16> %x, i32 %i12
  %x13 = extractelement <16 x i16> %x, i32 %i13
  %x14 = extractelement <16 x i16> %x, i32 %i14
  %x15 = extractelement <16 x i16> %x, i32 %i15
  %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
  %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
  %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
  %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
  %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
  %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
  %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
  %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
  %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX1: # BB#0:
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX1-NEXT: movslq %edi, %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: movslq %esi, %rax
; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq %edx, %rax
; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq %ecx, %rax
; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq %r8d, %rax
; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq %r9d, %rax
; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; AVX2-NEXT: movslq %edi, %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: movslq %esi, %rax
; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq %edx, %rax
; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq %ecx, %rax
; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq %r8d, %rax
; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq %r9d, %rax
; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 %i0
  %x1 = extractelement <8 x i16> %x, i32 %i1
  %x2 = extractelement <8 x i16> %x, i32 %i2
  %x3 = extractelement <8 x i16> %x, i32 %i3
  %x4 = extractelement <8 x i16> %x, i32 %i4
  %x5 = extractelement <8 x i16> %x, i32 %i5
  %x6 = extractelement <8 x i16> %x, i32 %i6
  %x7 = extractelement <8 x i16> %x, i32 %i7
  %x8 = extractelement <8 x i16> %x, i32 %i8
  %x9 = extractelement <8 x i16> %x, i32 %i9
  %x10 = extractelement <8 x i16> %x, i32 %i10
  %x11 = extractelement <8 x i16> %x, i32 %i11
  %x12 = extractelement <8 x i16> %x, i32 %i12
  %x13 = extractelement <8 x i16> %x, i32 %i13
  %x14 = extractelement <8 x i16> %x, i32 %i14
  %x15 = extractelement <8 x i16> %x, i32 %i15
  %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
  %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
  %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
  %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
  %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
  %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
  %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
  %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
  %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
  %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
  %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
  %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
  %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
  %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
  %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
  %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
  ret <16 x i16> %r15
}

;
; Unary shuffle indices from memory
;

define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX1: # BB#0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq 8(%rdi), %rcx
; AVX1-NEXT: movq 16(%rdi), %rdx
; AVX1-NEXT: movq 24(%rdi), %rsi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
; AVX2: # BB#0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq 8(%rdi), %rcx
; AVX2-NEXT: movq 16(%rdi), %rdx
; AVX2-NEXT: movq 24(%rdi), %rsi
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
  %p0 = getelementptr inbounds i64, i64* %i, i32 0
  %p1 = getelementptr inbounds i64, i64* %i, i32 1
  %p2 = getelementptr inbounds i64, i64* %i, i32 2
  %p3 = getelementptr inbounds i64, i64* %i, i32 3
  %i0 = load i64, i64* %p0, align 4
  %i1 = load i64, i64* %p1, align 4
  %i2 = load i64, i64* %p2, align 4
  %i3 = load i64, i64* %p3, align 4
  %x0 = extractelement <4 x i64> %x, i64 %i0
  %x1 = extractelement <4 x i64> %x, i64 %i1
  %x2 = extractelement <4 x i64> %x, i64 %i2
  %x3 = extractelement <4 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}

define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX1: # BB#0:
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq 8(%rdi), %rcx
; AVX1-NEXT: movq 16(%rdi), %rdx
; AVX1-NEXT: movq 24(%rdi), %rsi
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
; AVX2: # BB#0:
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq 8(%rdi), %rcx
; AVX2-NEXT: movq 16(%rdi), %rdx
; AVX2-NEXT: movq 24(%rdi), %rsi
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %p0 = getelementptr inbounds i64, i64* %i, i32 0
  %p1 = getelementptr inbounds i64, i64* %i, i32 1
  %p2 = getelementptr inbounds i64, i64* %i, i32 2
  %p3 = getelementptr inbounds i64, i64* %i, i32 3
  %i0 = load i64, i64* %p0, align 4
  %i1 = load i64, i64* %p1, align 4
  %i2 = load i64, i64* %p2, align 4
  %i3 = load i64, i64* %p3, align 4
  %x0 = extractelement <2 x i64> %x, i64 %i0
  %x1 = extractelement <2 x i64> %x, i64 %i1
  %x2 = extractelement <2 x i64> %x, i64 %i2
  %x3 = extractelement <2 x i64> %x, i64 %i3
  %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
  %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
  %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
  ret <4 x i64> %r3
}