1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
5
6;
7; Half to Float
8;
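; The checks below show each half lane being converted individually: the i16 bits
; are sign-extended into a GPR (movswl), moved to an XMM register (vmovd), widened
; with vcvtph2ps, and the resulting floats are reassembled with vinsertps and,
; for the wider vectors, vinsertf128/vinserti128.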
9
10define float @cvt_i16_to_f32(i16 %a0) {
11; ALL-LABEL: cvt_i16_to_f32:
12; ALL: # BB#0:
13; ALL-NEXT: movswl %di, %eax
14; ALL-NEXT: vmovd %eax, %xmm0
15; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
16; ALL-NEXT: retq
17 %1 = bitcast i16 %a0 to half
18 %2 = fpext half %1 to float
19 ret float %2
20}
21
22define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) {
23; ALL-LABEL: cvt_4i16_to_4f32:
24; ALL: # BB#0:
25; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
26; ALL-NEXT: vmovq %xmm0, %rax
27; ALL-NEXT: movq %rax, %rcx
28; ALL-NEXT: movq %rax, %rdx
29; ALL-NEXT: movswl %ax, %esi
30; ALL-NEXT: shrl $16, %eax
31; ALL-NEXT: shrq $32, %rcx
32; ALL-NEXT: shrq $48, %rdx
33; ALL-NEXT: movswl %dx, %edx
34; ALL-NEXT: vmovd %edx, %xmm0
35; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
36; ALL-NEXT: movswl %cx, %ecx
37; ALL-NEXT: vmovd %ecx, %xmm1
38; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
39; ALL-NEXT: cwtl
40; ALL-NEXT: vmovd %eax, %xmm2
41; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
42; ALL-NEXT: vmovd %esi, %xmm3
43; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
44; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
45; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
46; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
47; ALL-NEXT: retq
48 %1 = bitcast <4 x i16> %a0 to <4 x half>
49 %2 = fpext <4 x half> %1 to <4 x float>
50 ret <4 x float> %2
51}
52
53define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) {
54; ALL-LABEL: cvt_8i16_to_4f32:
55; ALL: # BB#0:
56; ALL-NEXT: vmovq %xmm0, %rax
57; ALL-NEXT: movq %rax, %rcx
58; ALL-NEXT: movq %rax, %rdx
59; ALL-NEXT: movswl %ax, %esi
60; ALL-NEXT: shrl $16, %eax
61; ALL-NEXT: shrq $32, %rcx
62; ALL-NEXT: shrq $48, %rdx
63; ALL-NEXT: movswl %dx, %edx
64; ALL-NEXT: vmovd %edx, %xmm0
65; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
66; ALL-NEXT: movswl %cx, %ecx
67; ALL-NEXT: vmovd %ecx, %xmm1
68; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
69; ALL-NEXT: cwtl
70; ALL-NEXT: vmovd %eax, %xmm2
71; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
72; ALL-NEXT: vmovd %esi, %xmm3
73; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
74; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
75; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
76; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
77; ALL-NEXT: retq
78 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
79 %2 = bitcast <4 x i16> %1 to <4 x half>
80 %3 = fpext <4 x half> %2 to <4 x float>
81 ret <4 x float> %3
82}
83
84define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) {
85; AVX1-LABEL: cvt_8i16_to_8f32:
86; AVX1: # BB#0:
87; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
88; AVX1-NEXT: movq %rdx, %r8
89; AVX1-NEXT: movq %rdx, %r10
90; AVX1-NEXT: movswl %dx, %r9d
91; AVX1-NEXT: shrl $16, %edx
92; AVX1-NEXT: shrq $32, %r8
93; AVX1-NEXT: shrq $48, %r10
94; AVX1-NEXT: vmovq %xmm0, %rdi
95; AVX1-NEXT: movq %rdi, %rax
96; AVX1-NEXT: movq %rdi, %rsi
97; AVX1-NEXT: movswl %di, %ecx
98; AVX1-NEXT: shrl $16, %edi
99; AVX1-NEXT: shrq $32, %rax
100; AVX1-NEXT: shrq $48, %rsi
101; AVX1-NEXT: movswl %si, %esi
102; AVX1-NEXT: vmovd %esi, %xmm0
103; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
104; AVX1-NEXT: cwtl
105; AVX1-NEXT: vmovd %eax, %xmm1
106; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
107; AVX1-NEXT: movswl %di, %eax
108; AVX1-NEXT: vmovd %eax, %xmm2
109; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
110; AVX1-NEXT: vmovd %ecx, %xmm3
111; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
112; AVX1-NEXT: movswl %r10w, %eax
113; AVX1-NEXT: vmovd %eax, %xmm4
114; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
115; AVX1-NEXT: movswl %r8w, %eax
116; AVX1-NEXT: vmovd %eax, %xmm5
117; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
118; AVX1-NEXT: movswl %dx, %eax
119; AVX1-NEXT: vmovd %eax, %xmm6
120; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
121; AVX1-NEXT: vmovd %r9d, %xmm7
122; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
123; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
124; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
125; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
126; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
127; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
128; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
129; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
130; AVX1-NEXT: retq
131;
132; AVX2-LABEL: cvt_8i16_to_8f32:
133; AVX2: # BB#0:
134; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
135; AVX2-NEXT: movq %rdx, %r8
136; AVX2-NEXT: movq %rdx, %r10
137; AVX2-NEXT: movswl %dx, %r9d
138; AVX2-NEXT: shrl $16, %edx
139; AVX2-NEXT: shrq $32, %r8
140; AVX2-NEXT: shrq $48, %r10
141; AVX2-NEXT: vmovq %xmm0, %rdi
142; AVX2-NEXT: movq %rdi, %rax
143; AVX2-NEXT: movq %rdi, %rsi
144; AVX2-NEXT: movswl %di, %ecx
145; AVX2-NEXT: shrl $16, %edi
146; AVX2-NEXT: shrq $32, %rax
147; AVX2-NEXT: shrq $48, %rsi
148; AVX2-NEXT: movswl %si, %esi
149; AVX2-NEXT: vmovd %esi, %xmm0
150; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
151; AVX2-NEXT: cwtl
152; AVX2-NEXT: vmovd %eax, %xmm1
153; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
154; AVX2-NEXT: movswl %di, %eax
155; AVX2-NEXT: vmovd %eax, %xmm2
156; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
157; AVX2-NEXT: vmovd %ecx, %xmm3
158; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
159; AVX2-NEXT: movswl %r10w, %eax
160; AVX2-NEXT: vmovd %eax, %xmm4
161; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
162; AVX2-NEXT: movswl %r8w, %eax
163; AVX2-NEXT: vmovd %eax, %xmm5
164; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
165; AVX2-NEXT: movswl %dx, %eax
166; AVX2-NEXT: vmovd %eax, %xmm6
167; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
168; AVX2-NEXT: vmovd %r9d, %xmm7
169; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
170; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
171; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
172; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
173; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
174; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
175; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
176; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
177; AVX2-NEXT: retq
178;
179; AVX512-LABEL: cvt_8i16_to_8f32:
180; AVX512: # BB#0:
181; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
182; AVX512-NEXT: movq %rdx, %r8
183; AVX512-NEXT: movq %rdx, %r10
184; AVX512-NEXT: movswl %dx, %r9d
185; AVX512-NEXT: shrl $16, %edx
186; AVX512-NEXT: shrq $32, %r8
187; AVX512-NEXT: shrq $48, %r10
188; AVX512-NEXT: vmovq %xmm0, %rdi
189; AVX512-NEXT: movq %rdi, %rax
190; AVX512-NEXT: movq %rdi, %rsi
191; AVX512-NEXT: movswl %di, %ecx
192; AVX512-NEXT: shrl $16, %edi
193; AVX512-NEXT: shrq $32, %rax
194; AVX512-NEXT: shrq $48, %rsi
195; AVX512-NEXT: movswl %si, %esi
196; AVX512-NEXT: vmovd %esi, %xmm0
197; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
198; AVX512-NEXT: cwtl
199; AVX512-NEXT: vmovd %eax, %xmm1
200; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
201; AVX512-NEXT: movswl %di, %eax
202; AVX512-NEXT: vmovd %eax, %xmm2
203; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
204; AVX512-NEXT: vmovd %ecx, %xmm3
205; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
206; AVX512-NEXT: movswl %r10w, %eax
207; AVX512-NEXT: vmovd %eax, %xmm4
208; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
209; AVX512-NEXT: movswl %r8w, %eax
210; AVX512-NEXT: vmovd %eax, %xmm5
211; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
212; AVX512-NEXT: movswl %dx, %eax
213; AVX512-NEXT: vmovd %eax, %xmm6
214; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
215; AVX512-NEXT: vmovd %r9d, %xmm7
216; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
217; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
218; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
219; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
220; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
221; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
222; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
223; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
224; AVX512-NEXT: retq
225 %1 = bitcast <8 x i16> %a0 to <8 x half>
226 %2 = fpext <8 x half> %1 to <8 x float>
227 ret <8 x float> %2
228}
229
230define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) {
231; AVX1-LABEL: cvt_16i16_to_16f32:
232; AVX1: # BB#0:
233; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
234; AVX1-NEXT: vmovq %xmm4, %rax
235; AVX1-NEXT: movq %rax, %rcx
236; AVX1-NEXT: shrq $48, %rcx
237; AVX1-NEXT: movswl %cx, %ecx
238; AVX1-NEXT: vmovd %ecx, %xmm8
239; AVX1-NEXT: movq %rax, %rcx
240; AVX1-NEXT: shrq $32, %rcx
241; AVX1-NEXT: movswl %cx, %ecx
242; AVX1-NEXT: vmovd %ecx, %xmm9
243; AVX1-NEXT: movswl %ax, %ecx
244; AVX1-NEXT: shrl $16, %eax
245; AVX1-NEXT: cwtl
246; AVX1-NEXT: vmovd %eax, %xmm10
247; AVX1-NEXT: vpextrq $1, %xmm4, %rax
248; AVX1-NEXT: vmovd %ecx, %xmm11
249; AVX1-NEXT: movq %rax, %rcx
250; AVX1-NEXT: shrq $48, %rcx
251; AVX1-NEXT: movswl %cx, %ecx
252; AVX1-NEXT: vmovd %ecx, %xmm12
253; AVX1-NEXT: movq %rax, %rcx
254; AVX1-NEXT: shrq $32, %rcx
255; AVX1-NEXT: movswl %cx, %ecx
256; AVX1-NEXT: vmovd %ecx, %xmm13
257; AVX1-NEXT: movswl %ax, %ecx
258; AVX1-NEXT: shrl $16, %eax
259; AVX1-NEXT: cwtl
260; AVX1-NEXT: vmovd %eax, %xmm14
261; AVX1-NEXT: vmovq %xmm0, %rax
262; AVX1-NEXT: vmovd %ecx, %xmm15
263; AVX1-NEXT: movq %rax, %rcx
264; AVX1-NEXT: shrq $48, %rcx
265; AVX1-NEXT: movswl %cx, %ecx
266; AVX1-NEXT: vmovd %ecx, %xmm2
267; AVX1-NEXT: movq %rax, %rcx
268; AVX1-NEXT: shrq $32, %rcx
269; AVX1-NEXT: movswl %cx, %ecx
270; AVX1-NEXT: vmovd %ecx, %xmm3
271; AVX1-NEXT: movswl %ax, %ecx
272; AVX1-NEXT: shrl $16, %eax
273; AVX1-NEXT: cwtl
274; AVX1-NEXT: vmovd %eax, %xmm4
275; AVX1-NEXT: vpextrq $1, %xmm0, %rax
276; AVX1-NEXT: vmovd %ecx, %xmm0
277; AVX1-NEXT: movq %rax, %rcx
278; AVX1-NEXT: shrq $48, %rcx
279; AVX1-NEXT: movswl %cx, %ecx
280; AVX1-NEXT: vmovd %ecx, %xmm5
281; AVX1-NEXT: movq %rax, %rcx
282; AVX1-NEXT: shrq $32, %rcx
283; AVX1-NEXT: movswl %cx, %ecx
284; AVX1-NEXT: vmovd %ecx, %xmm6
285; AVX1-NEXT: movl %eax, %ecx
286; AVX1-NEXT: shrl $16, %ecx
287; AVX1-NEXT: movswl %cx, %ecx
288; AVX1-NEXT: vmovd %ecx, %xmm7
289; AVX1-NEXT: cwtl
290; AVX1-NEXT: vmovd %eax, %xmm1
291; AVX1-NEXT: vcvtph2ps %xmm8, %xmm8
292; AVX1-NEXT: vcvtph2ps %xmm9, %xmm9
293; AVX1-NEXT: vcvtph2ps %xmm10, %xmm10
294; AVX1-NEXT: vcvtph2ps %xmm11, %xmm11
295; AVX1-NEXT: vcvtph2ps %xmm12, %xmm12
296; AVX1-NEXT: vcvtph2ps %xmm13, %xmm13
297; AVX1-NEXT: vcvtph2ps %xmm14, %xmm14
298; AVX1-NEXT: vcvtph2ps %xmm15, %xmm15
299; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
300; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
301; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
302; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
303; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
304; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
305; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
306; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
307; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
308; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
309; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
310; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
311; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
312; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
313; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
314; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
315; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
316; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
317; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
318; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
319; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
320; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
321; AVX1-NEXT: retq
322;
323; AVX2-LABEL: cvt_16i16_to_16f32:
324; AVX2: # BB#0:
325; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
326; AVX2-NEXT: vmovq %xmm4, %rax
327; AVX2-NEXT: movq %rax, %rcx
328; AVX2-NEXT: shrq $48, %rcx
329; AVX2-NEXT: movswl %cx, %ecx
330; AVX2-NEXT: vmovd %ecx, %xmm8
331; AVX2-NEXT: movq %rax, %rcx
332; AVX2-NEXT: shrq $32, %rcx
333; AVX2-NEXT: movswl %cx, %ecx
334; AVX2-NEXT: vmovd %ecx, %xmm9
335; AVX2-NEXT: movswl %ax, %ecx
336; AVX2-NEXT: shrl $16, %eax
337; AVX2-NEXT: cwtl
338; AVX2-NEXT: vmovd %eax, %xmm10
339; AVX2-NEXT: vpextrq $1, %xmm4, %rax
340; AVX2-NEXT: vmovd %ecx, %xmm11
341; AVX2-NEXT: movq %rax, %rcx
342; AVX2-NEXT: shrq $48, %rcx
343; AVX2-NEXT: movswl %cx, %ecx
344; AVX2-NEXT: vmovd %ecx, %xmm12
345; AVX2-NEXT: movq %rax, %rcx
346; AVX2-NEXT: shrq $32, %rcx
347; AVX2-NEXT: movswl %cx, %ecx
348; AVX2-NEXT: vmovd %ecx, %xmm13
349; AVX2-NEXT: movswl %ax, %ecx
350; AVX2-NEXT: shrl $16, %eax
351; AVX2-NEXT: cwtl
352; AVX2-NEXT: vmovd %eax, %xmm14
353; AVX2-NEXT: vmovq %xmm0, %rax
354; AVX2-NEXT: vmovd %ecx, %xmm15
355; AVX2-NEXT: movq %rax, %rcx
356; AVX2-NEXT: shrq $48, %rcx
357; AVX2-NEXT: movswl %cx, %ecx
358; AVX2-NEXT: vmovd %ecx, %xmm2
359; AVX2-NEXT: movq %rax, %rcx
360; AVX2-NEXT: shrq $32, %rcx
361; AVX2-NEXT: movswl %cx, %ecx
362; AVX2-NEXT: vmovd %ecx, %xmm3
363; AVX2-NEXT: movswl %ax, %ecx
364; AVX2-NEXT: shrl $16, %eax
365; AVX2-NEXT: cwtl
366; AVX2-NEXT: vmovd %eax, %xmm4
367; AVX2-NEXT: vpextrq $1, %xmm0, %rax
368; AVX2-NEXT: vmovd %ecx, %xmm0
369; AVX2-NEXT: movq %rax, %rcx
370; AVX2-NEXT: shrq $48, %rcx
371; AVX2-NEXT: movswl %cx, %ecx
372; AVX2-NEXT: vmovd %ecx, %xmm5
373; AVX2-NEXT: movq %rax, %rcx
374; AVX2-NEXT: shrq $32, %rcx
375; AVX2-NEXT: movswl %cx, %ecx
376; AVX2-NEXT: vmovd %ecx, %xmm6
377; AVX2-NEXT: movl %eax, %ecx
378; AVX2-NEXT: shrl $16, %ecx
379; AVX2-NEXT: movswl %cx, %ecx
380; AVX2-NEXT: vmovd %ecx, %xmm7
381; AVX2-NEXT: cwtl
382; AVX2-NEXT: vmovd %eax, %xmm1
383; AVX2-NEXT: vcvtph2ps %xmm8, %xmm8
384; AVX2-NEXT: vcvtph2ps %xmm9, %xmm9
385; AVX2-NEXT: vcvtph2ps %xmm10, %xmm10
386; AVX2-NEXT: vcvtph2ps %xmm11, %xmm11
387; AVX2-NEXT: vcvtph2ps %xmm12, %xmm12
388; AVX2-NEXT: vcvtph2ps %xmm13, %xmm13
389; AVX2-NEXT: vcvtph2ps %xmm14, %xmm14
390; AVX2-NEXT: vcvtph2ps %xmm15, %xmm15
391; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
392; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
393; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
394; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
395; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
396; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
397; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
398; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
399; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[2,3]
400; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
401; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
402; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
403; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
404; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
405; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
406; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[2,3]
407; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
408; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
409; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0],xmm10[0],xmm11[2,3]
410; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
411; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
412; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
413; AVX2-NEXT: retq
414;
415; AVX512-LABEL: cvt_16i16_to_16f32:
416; AVX512: # BB#0:
417; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
418; AVX512-NEXT: vmovq %xmm0, %rax
419; AVX512-NEXT: movq %rax, %rcx
420; AVX512-NEXT: shrq $48, %rcx
421; AVX512-NEXT: movswl %cx, %ecx
422; AVX512-NEXT: vmovd %ecx, %xmm8
423; AVX512-NEXT: movq %rax, %rcx
424; AVX512-NEXT: shrq $32, %rcx
425; AVX512-NEXT: movswl %cx, %ecx
426; AVX512-NEXT: vmovd %ecx, %xmm9
427; AVX512-NEXT: movswl %ax, %ecx
428; AVX512-NEXT: shrl $16, %eax
429; AVX512-NEXT: cwtl
430; AVX512-NEXT: vmovd %eax, %xmm11
431; AVX512-NEXT: vpextrq $1, %xmm0, %rax
432; AVX512-NEXT: vmovd %ecx, %xmm12
433; AVX512-NEXT: movq %rax, %rcx
434; AVX512-NEXT: shrq $48, %rcx
435; AVX512-NEXT: movswl %cx, %ecx
436; AVX512-NEXT: vmovd %ecx, %xmm13
437; AVX512-NEXT: movq %rax, %rcx
438; AVX512-NEXT: shrq $32, %rcx
439; AVX512-NEXT: movswl %cx, %ecx
440; AVX512-NEXT: vmovd %ecx, %xmm14
441; AVX512-NEXT: movswl %ax, %ecx
442; AVX512-NEXT: shrl $16, %eax
443; AVX512-NEXT: cwtl
444; AVX512-NEXT: vmovd %eax, %xmm15
445; AVX512-NEXT: vmovq %xmm10, %rax
446; AVX512-NEXT: vmovd %ecx, %xmm2
447; AVX512-NEXT: movq %rax, %rcx
448; AVX512-NEXT: shrq $48, %rcx
449; AVX512-NEXT: movswl %cx, %ecx
450; AVX512-NEXT: vmovd %ecx, %xmm3
451; AVX512-NEXT: movq %rax, %rcx
452; AVX512-NEXT: shrq $32, %rcx
453; AVX512-NEXT: movswl %cx, %ecx
454; AVX512-NEXT: vmovd %ecx, %xmm1
455; AVX512-NEXT: movswl %ax, %ecx
456; AVX512-NEXT: shrl $16, %eax
457; AVX512-NEXT: cwtl
458; AVX512-NEXT: vmovd %eax, %xmm4
459; AVX512-NEXT: vpextrq $1, %xmm10, %rax
460; AVX512-NEXT: vmovd %ecx, %xmm10
461; AVX512-NEXT: movq %rax, %rcx
462; AVX512-NEXT: shrq $48, %rcx
463; AVX512-NEXT: movswl %cx, %ecx
464; AVX512-NEXT: vmovd %ecx, %xmm5
465; AVX512-NEXT: movq %rax, %rcx
466; AVX512-NEXT: shrq $32, %rcx
467; AVX512-NEXT: movswl %cx, %ecx
468; AVX512-NEXT: vmovd %ecx, %xmm6
469; AVX512-NEXT: movl %eax, %ecx
470; AVX512-NEXT: shrl $16, %ecx
471; AVX512-NEXT: movswl %cx, %ecx
472; AVX512-NEXT: vmovd %ecx, %xmm7
473; AVX512-NEXT: cwtl
474; AVX512-NEXT: vmovd %eax, %xmm0
475; AVX512-NEXT: vcvtph2ps %xmm8, %xmm8
476; AVX512-NEXT: vcvtph2ps %xmm9, %xmm9
477; AVX512-NEXT: vcvtph2ps %xmm11, %xmm11
478; AVX512-NEXT: vcvtph2ps %xmm12, %xmm12
479; AVX512-NEXT: vcvtph2ps %xmm13, %xmm13
480; AVX512-NEXT: vcvtph2ps %xmm14, %xmm14
481; AVX512-NEXT: vcvtph2ps %xmm15, %xmm15
482; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
483; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
484; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
485; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
486; AVX512-NEXT: vcvtph2ps %xmm10, %xmm10
487; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
488; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
489; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
490; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
491; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
492; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
493; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
494; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
495; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
496; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
497; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
498; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
499; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
500; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
501; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
502; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
503; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
504; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
505; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
506; AVX512-NEXT: retq
507 %1 = bitcast <16 x i16> %a0 to <16 x half>
508 %2 = fpext <16 x half> %1 to <16 x float>
509 ret <16 x float> %2
510}
511
512;
513; Half to Float (Load)
514;
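; When the halves come from memory, the checks follow the same per-lane pattern,
; except each element is sign-extended straight from its memory offset
; (movswl 2(%rdi), %eax, etc.) before the vcvtph2ps conversion.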
515
516define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) {
517; ALL-LABEL: load_cvt_4i16_to_4f32:
518; ALL: # BB#0:
519; ALL-NEXT: movswl 6(%rdi), %eax
520; ALL-NEXT: vmovd %eax, %xmm0
521; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
522; ALL-NEXT: movswl 4(%rdi), %eax
523; ALL-NEXT: vmovd %eax, %xmm1
524; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
525; ALL-NEXT: movswl (%rdi), %eax
526; ALL-NEXT: vmovd %eax, %xmm2
527; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
528; ALL-NEXT: movswl 2(%rdi), %eax
529; ALL-NEXT: vmovd %eax, %xmm3
530; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
531; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
532; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
533; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
534; ALL-NEXT: retq
535 %1 = load <4 x i16>, <4 x i16>* %a0
536 %2 = bitcast <4 x i16> %1 to <4 x half>
537 %3 = fpext <4 x half> %2 to <4 x float>
538 ret <4 x float> %3
539}
540
541define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) {
542; ALL-LABEL: load_cvt_8i16_to_4f32:
543; ALL: # BB#0:
544; ALL-NEXT: movq (%rdi), %rax
545; ALL-NEXT: movq %rax, %rcx
546; ALL-NEXT: movq %rax, %rdx
547; ALL-NEXT: movswl %ax, %esi
548; ALL-NEXT: shrl $16, %eax
549; ALL-NEXT: shrq $32, %rcx
550; ALL-NEXT: shrq $48, %rdx
551; ALL-NEXT: movswl %dx, %edx
552; ALL-NEXT: vmovd %edx, %xmm0
553; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
554; ALL-NEXT: movswl %cx, %ecx
555; ALL-NEXT: vmovd %ecx, %xmm1
556; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
557; ALL-NEXT: cwtl
558; ALL-NEXT: vmovd %eax, %xmm2
559; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
560; ALL-NEXT: vmovd %esi, %xmm3
561; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
562; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
563; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
564; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
565; ALL-NEXT: retq
566 %1 = load <8 x i16>, <8 x i16>* %a0
567 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
568 %3 = bitcast <4 x i16> %2 to <4 x half>
569 %4 = fpext <4 x half> %3 to <4 x float>
570 ret <4 x float> %4
571}
572
573define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) {
574; AVX1-LABEL: load_cvt_8i16_to_8f32:
575; AVX1: # BB#0:
576; AVX1-NEXT: movswl 6(%rdi), %eax
577; AVX1-NEXT: vmovd %eax, %xmm0
578; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
579; AVX1-NEXT: movswl 4(%rdi), %eax
580; AVX1-NEXT: vmovd %eax, %xmm1
581; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
582; AVX1-NEXT: movswl (%rdi), %eax
583; AVX1-NEXT: vmovd %eax, %xmm2
584; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
585; AVX1-NEXT: movswl 2(%rdi), %eax
586; AVX1-NEXT: vmovd %eax, %xmm3
587; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
588; AVX1-NEXT: movswl 14(%rdi), %eax
589; AVX1-NEXT: vmovd %eax, %xmm4
590; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
591; AVX1-NEXT: movswl 12(%rdi), %eax
592; AVX1-NEXT: vmovd %eax, %xmm5
593; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
594; AVX1-NEXT: movswl 8(%rdi), %eax
595; AVX1-NEXT: vmovd %eax, %xmm6
596; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
597; AVX1-NEXT: movswl 10(%rdi), %eax
598; AVX1-NEXT: vmovd %eax, %xmm7
599; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
600; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
601; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
602; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
603; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
604; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
605; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
606; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
607; AVX1-NEXT: retq
608;
609; AVX2-LABEL: load_cvt_8i16_to_8f32:
610; AVX2: # BB#0:
611; AVX2-NEXT: movswl 6(%rdi), %eax
612; AVX2-NEXT: vmovd %eax, %xmm0
613; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
614; AVX2-NEXT: movswl 4(%rdi), %eax
615; AVX2-NEXT: vmovd %eax, %xmm1
616; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
617; AVX2-NEXT: movswl (%rdi), %eax
618; AVX2-NEXT: vmovd %eax, %xmm2
619; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
620; AVX2-NEXT: movswl 2(%rdi), %eax
621; AVX2-NEXT: vmovd %eax, %xmm3
622; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
623; AVX2-NEXT: movswl 14(%rdi), %eax
624; AVX2-NEXT: vmovd %eax, %xmm4
625; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
626; AVX2-NEXT: movswl 12(%rdi), %eax
627; AVX2-NEXT: vmovd %eax, %xmm5
628; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
629; AVX2-NEXT: movswl 8(%rdi), %eax
630; AVX2-NEXT: vmovd %eax, %xmm6
631; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
632; AVX2-NEXT: movswl 10(%rdi), %eax
633; AVX2-NEXT: vmovd %eax, %xmm7
634; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
635; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
636; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
637; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
638; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
639; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
640; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
641; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
642; AVX2-NEXT: retq
643;
644; AVX512-LABEL: load_cvt_8i16_to_8f32:
645; AVX512: # BB#0:
646; AVX512-NEXT: movswl 6(%rdi), %eax
647; AVX512-NEXT: vmovd %eax, %xmm0
648; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
649; AVX512-NEXT: movswl 4(%rdi), %eax
650; AVX512-NEXT: vmovd %eax, %xmm1
651; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
652; AVX512-NEXT: movswl (%rdi), %eax
653; AVX512-NEXT: vmovd %eax, %xmm2
654; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
655; AVX512-NEXT: movswl 2(%rdi), %eax
656; AVX512-NEXT: vmovd %eax, %xmm3
657; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
658; AVX512-NEXT: movswl 14(%rdi), %eax
659; AVX512-NEXT: vmovd %eax, %xmm4
660; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
661; AVX512-NEXT: movswl 12(%rdi), %eax
662; AVX512-NEXT: vmovd %eax, %xmm5
663; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
664; AVX512-NEXT: movswl 8(%rdi), %eax
665; AVX512-NEXT: vmovd %eax, %xmm6
666; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
667; AVX512-NEXT: movswl 10(%rdi), %eax
668; AVX512-NEXT: vmovd %eax, %xmm7
669; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
670; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
671; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
672; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
673; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
674; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
675; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
676; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
677; AVX512-NEXT: retq
678 %1 = load <8 x i16>, <8 x i16>* %a0
679 %2 = bitcast <8 x i16> %1 to <8 x half>
680 %3 = fpext <8 x half> %2 to <8 x float>
681 ret <8 x float> %3
682}
683
684define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) {
685; AVX1-LABEL: load_cvt_16i16_to_16f32:
686; AVX1: # BB#0:
687; AVX1-NEXT: movswl 22(%rdi), %eax
688; AVX1-NEXT: vmovd %eax, %xmm0
689; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8
690; AVX1-NEXT: movswl 20(%rdi), %eax
691; AVX1-NEXT: vmovd %eax, %xmm0
692; AVX1-NEXT: vcvtph2ps %xmm0, %xmm9
693; AVX1-NEXT: movswl 16(%rdi), %eax
694; AVX1-NEXT: vmovd %eax, %xmm0
695; AVX1-NEXT: vcvtph2ps %xmm0, %xmm10
696; AVX1-NEXT: movswl 18(%rdi), %eax
697; AVX1-NEXT: vmovd %eax, %xmm0
698; AVX1-NEXT: vcvtph2ps %xmm0, %xmm11
699; AVX1-NEXT: movswl 30(%rdi), %eax
700; AVX1-NEXT: vmovd %eax, %xmm0
701; AVX1-NEXT: vcvtph2ps %xmm0, %xmm12
702; AVX1-NEXT: movswl 28(%rdi), %eax
703; AVX1-NEXT: vmovd %eax, %xmm0
704; AVX1-NEXT: vcvtph2ps %xmm0, %xmm13
705; AVX1-NEXT: movswl 24(%rdi), %eax
706; AVX1-NEXT: vmovd %eax, %xmm0
707; AVX1-NEXT: vcvtph2ps %xmm0, %xmm14
708; AVX1-NEXT: movswl 26(%rdi), %eax
709; AVX1-NEXT: vmovd %eax, %xmm0
710; AVX1-NEXT: vcvtph2ps %xmm0, %xmm15
711; AVX1-NEXT: movswl 6(%rdi), %eax
712; AVX1-NEXT: vmovd %eax, %xmm0
713; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
714; AVX1-NEXT: movswl 4(%rdi), %eax
715; AVX1-NEXT: vmovd %eax, %xmm2
716; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
717; AVX1-NEXT: movswl (%rdi), %eax
718; AVX1-NEXT: vmovd %eax, %xmm3
719; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
720; AVX1-NEXT: movswl 2(%rdi), %eax
721; AVX1-NEXT: vmovd %eax, %xmm4
722; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
723; AVX1-NEXT: movswl 14(%rdi), %eax
724; AVX1-NEXT: vmovd %eax, %xmm5
725; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
726; AVX1-NEXT: movswl 12(%rdi), %eax
727; AVX1-NEXT: vmovd %eax, %xmm6
728; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
729; AVX1-NEXT: movswl 8(%rdi), %eax
730; AVX1-NEXT: vmovd %eax, %xmm7
731; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
732; AVX1-NEXT: movswl 10(%rdi), %eax
733; AVX1-NEXT: vmovd %eax, %xmm1
734; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
735; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
736; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
737; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
738; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
739; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
740; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
741; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
742; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
743; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
744; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
745; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
746; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
747; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
748; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
749; AVX1-NEXT: retq
750;
751; AVX2-LABEL: load_cvt_16i16_to_16f32:
752; AVX2: # BB#0:
753; AVX2-NEXT: movswl 22(%rdi), %eax
754; AVX2-NEXT: vmovd %eax, %xmm0
755; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8
756; AVX2-NEXT: movswl 20(%rdi), %eax
757; AVX2-NEXT: vmovd %eax, %xmm0
758; AVX2-NEXT: vcvtph2ps %xmm0, %xmm9
759; AVX2-NEXT: movswl 16(%rdi), %eax
760; AVX2-NEXT: vmovd %eax, %xmm0
761; AVX2-NEXT: vcvtph2ps %xmm0, %xmm10
762; AVX2-NEXT: movswl 18(%rdi), %eax
763; AVX2-NEXT: vmovd %eax, %xmm0
764; AVX2-NEXT: vcvtph2ps %xmm0, %xmm11
765; AVX2-NEXT: movswl 30(%rdi), %eax
766; AVX2-NEXT: vmovd %eax, %xmm0
767; AVX2-NEXT: vcvtph2ps %xmm0, %xmm12
768; AVX2-NEXT: movswl 28(%rdi), %eax
769; AVX2-NEXT: vmovd %eax, %xmm0
770; AVX2-NEXT: vcvtph2ps %xmm0, %xmm13
771; AVX2-NEXT: movswl 24(%rdi), %eax
772; AVX2-NEXT: vmovd %eax, %xmm0
773; AVX2-NEXT: vcvtph2ps %xmm0, %xmm14
774; AVX2-NEXT: movswl 26(%rdi), %eax
775; AVX2-NEXT: vmovd %eax, %xmm0
776; AVX2-NEXT: vcvtph2ps %xmm0, %xmm15
777; AVX2-NEXT: movswl 6(%rdi), %eax
778; AVX2-NEXT: vmovd %eax, %xmm0
779; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
780; AVX2-NEXT: movswl 4(%rdi), %eax
781; AVX2-NEXT: vmovd %eax, %xmm2
782; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
783; AVX2-NEXT: movswl (%rdi), %eax
784; AVX2-NEXT: vmovd %eax, %xmm3
785; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
786; AVX2-NEXT: movswl 2(%rdi), %eax
787; AVX2-NEXT: vmovd %eax, %xmm4
788; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
789; AVX2-NEXT: movswl 14(%rdi), %eax
790; AVX2-NEXT: vmovd %eax, %xmm5
791; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
792; AVX2-NEXT: movswl 12(%rdi), %eax
793; AVX2-NEXT: vmovd %eax, %xmm6
794; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
795; AVX2-NEXT: movswl 8(%rdi), %eax
796; AVX2-NEXT: vmovd %eax, %xmm7
797; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
798; AVX2-NEXT: movswl 10(%rdi), %eax
799; AVX2-NEXT: vmovd %eax, %xmm1
800; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
801; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3]
802; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0],xmm1[3]
803; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[0]
804; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3]
805; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
806; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
807; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
808; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
809; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
810; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
811; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
812; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
813; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
814; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
815; AVX2-NEXT: retq
816;
817; AVX512-LABEL: load_cvt_16i16_to_16f32:
818; AVX512: # BB#0:
819; AVX512-NEXT: movswl 6(%rdi), %eax
820; AVX512-NEXT: vmovd %eax, %xmm0
821; AVX512-NEXT: vcvtph2ps %xmm0, %xmm8
822; AVX512-NEXT: movswl 4(%rdi), %eax
823; AVX512-NEXT: vmovd %eax, %xmm0
824; AVX512-NEXT: vcvtph2ps %xmm0, %xmm9
825; AVX512-NEXT: movswl (%rdi), %eax
826; AVX512-NEXT: vmovd %eax, %xmm0
827; AVX512-NEXT: vcvtph2ps %xmm0, %xmm10
828; AVX512-NEXT: movswl 2(%rdi), %eax
829; AVX512-NEXT: vmovd %eax, %xmm0
830; AVX512-NEXT: vcvtph2ps %xmm0, %xmm11
831; AVX512-NEXT: movswl 14(%rdi), %eax
832; AVX512-NEXT: vmovd %eax, %xmm0
833; AVX512-NEXT: vcvtph2ps %xmm0, %xmm12
834; AVX512-NEXT: movswl 12(%rdi), %eax
835; AVX512-NEXT: vmovd %eax, %xmm0
836; AVX512-NEXT: vcvtph2ps %xmm0, %xmm13
837; AVX512-NEXT: movswl 8(%rdi), %eax
838; AVX512-NEXT: vmovd %eax, %xmm0
839; AVX512-NEXT: vcvtph2ps %xmm0, %xmm14
840; AVX512-NEXT: movswl 10(%rdi), %eax
841; AVX512-NEXT: vmovd %eax, %xmm0
842; AVX512-NEXT: vcvtph2ps %xmm0, %xmm15
843; AVX512-NEXT: movswl 22(%rdi), %eax
844; AVX512-NEXT: vmovd %eax, %xmm0
845; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
846; AVX512-NEXT: movswl 20(%rdi), %eax
847; AVX512-NEXT: vmovd %eax, %xmm1
848; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
849; AVX512-NEXT: movswl 16(%rdi), %eax
850; AVX512-NEXT: vmovd %eax, %xmm2
851; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
852; AVX512-NEXT: movswl 18(%rdi), %eax
853; AVX512-NEXT: vmovd %eax, %xmm3
854; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
855; AVX512-NEXT: movswl 30(%rdi), %eax
856; AVX512-NEXT: vmovd %eax, %xmm4
857; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
858; AVX512-NEXT: movswl 28(%rdi), %eax
859; AVX512-NEXT: vmovd %eax, %xmm5
860; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
861; AVX512-NEXT: movswl 24(%rdi), %eax
862; AVX512-NEXT: vmovd %eax, %xmm6
863; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
864; AVX512-NEXT: movswl 26(%rdi), %eax
865; AVX512-NEXT: vmovd %eax, %xmm7
866; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
867; AVX512-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
868; AVX512-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
869; AVX512-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
870; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
871; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
872; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
873; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
874; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
875; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
876; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
877; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
878; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
879; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
880; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
881; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
882; AVX512-NEXT: retq
883 %1 = load <16 x i16>, <16 x i16>* %a0
884 %2 = bitcast <16 x i16> %1 to <16 x half>
885 %3 = fpext <16 x half> %2 to <16 x float>
886 ret <16 x float> %3
887}
888
889;
890; Float to Half
891;
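; Going the other way, the checks show each float lane isolated with
; vmovshdup/vpermilpd/vpermilps, narrowed with vcvtps2ph, and the i16 results
; repacked either through GPR shift/or sequences or with vpinsrw in the
; 16-element case.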
892
893define i16 @cvt_f32_to_i16(float %a0) {
894; ALL-LABEL: cvt_f32_to_i16:
895; ALL: # BB#0:
896; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
897; ALL-NEXT: vmovd %xmm0, %eax
898; ALL-NEXT: retq
899 %1 = fptrunc float %a0 to half
900 %2 = bitcast half %1 to i16
901 ret i16 %2
902}
903
904define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) {
905; ALL-LABEL: cvt_4f32_to_4i16:
906; ALL: # BB#0:
907; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
908; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
909; ALL-NEXT: vmovd %xmm1, %eax
910; ALL-NEXT: shll $16, %eax
911; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
912; ALL-NEXT: vmovd %xmm1, %ecx
913; ALL-NEXT: movzwl %cx, %ecx
914; ALL-NEXT: orl %eax, %ecx
915; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
916; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
917; ALL-NEXT: vmovd %xmm1, %eax
918; ALL-NEXT: shll $16, %eax
919; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
920; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
921; ALL-NEXT: vmovd %xmm0, %edx
922; ALL-NEXT: movzwl %dx, %edx
923; ALL-NEXT: orl %eax, %edx
924; ALL-NEXT: shlq $32, %rdx
925; ALL-NEXT: orq %rcx, %rdx
926; ALL-NEXT: vmovq %rdx, %xmm0
927; ALL-NEXT: retq
928 %1 = fptrunc <4 x float> %a0 to <4 x half>
929 %2 = bitcast <4 x half> %1 to <4 x i16>
930 ret <4 x i16> %2
931}
932
933define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) {
934; ALL-LABEL: cvt_4f32_to_8i16_undef:
935; ALL: # BB#0:
936; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
937; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
938; ALL-NEXT: vmovd %xmm1, %eax
939; ALL-NEXT: shll $16, %eax
940; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
941; ALL-NEXT: vmovd %xmm1, %ecx
942; ALL-NEXT: movzwl %cx, %ecx
943; ALL-NEXT: orl %eax, %ecx
944; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
945; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
946; ALL-NEXT: vmovd %xmm1, %eax
947; ALL-NEXT: shll $16, %eax
948; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
949; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
950; ALL-NEXT: vmovd %xmm0, %edx
951; ALL-NEXT: movzwl %dx, %edx
952; ALL-NEXT: orl %eax, %edx
953; ALL-NEXT: shlq $32, %rdx
954; ALL-NEXT: orq %rcx, %rdx
955; ALL-NEXT: vmovq %rdx, %xmm0
956; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
957; ALL-NEXT: retq
958 %1 = fptrunc <4 x float> %a0 to <4 x half>
959 %2 = bitcast <4 x half> %1 to <4 x i16>
960 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
961 ret <8 x i16> %3
962}
963
964define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) {
965; ALL-LABEL: cvt_4f32_to_8i16_zero:
966; ALL: # BB#0:
967; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
968; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
969; ALL-NEXT: vmovd %xmm1, %eax
970; ALL-NEXT: shll $16, %eax
971; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
972; ALL-NEXT: vmovd %xmm1, %ecx
973; ALL-NEXT: movzwl %cx, %ecx
974; ALL-NEXT: orl %eax, %ecx
975; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
976; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
977; ALL-NEXT: vmovd %xmm1, %eax
978; ALL-NEXT: shll $16, %eax
979; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
980; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
981; ALL-NEXT: vmovd %xmm0, %edx
982; ALL-NEXT: movzwl %dx, %edx
983; ALL-NEXT: orl %eax, %edx
984; ALL-NEXT: shlq $32, %rdx
985; ALL-NEXT: orq %rcx, %rdx
986; ALL-NEXT: vmovq %rdx, %xmm0
987; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
988; ALL-NEXT: retq
989 %1 = fptrunc <4 x float> %a0 to <4 x half>
990 %2 = bitcast <4 x half> %1 to <4 x i16>
991 %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
992 ret <8 x i16> %3
993}
994
995define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) {
996; AVX1-LABEL: cvt_8f32_to_8i16:
997; AVX1: # BB#0:
998; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
999; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1000; AVX1-NEXT: vmovd %xmm1, %eax
1001; AVX1-NEXT: shll $16, %eax
1002; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1003; AVX1-NEXT: vmovd %xmm1, %ecx
1004; AVX1-NEXT: movzwl %cx, %ecx
1005; AVX1-NEXT: orl %eax, %ecx
1006; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1007; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1008; AVX1-NEXT: vmovd %xmm1, %edx
1009; AVX1-NEXT: shll $16, %edx
1010; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1011; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1012; AVX1-NEXT: vmovd %xmm1, %eax
1013; AVX1-NEXT: movzwl %ax, %eax
1014; AVX1-NEXT: orl %edx, %eax
1015; AVX1-NEXT: shlq $32, %rax
1016; AVX1-NEXT: orq %rcx, %rax
1017; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1018; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1019; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1020; AVX1-NEXT: vmovd %xmm1, %ecx
1021; AVX1-NEXT: shll $16, %ecx
1022; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1023; AVX1-NEXT: vmovd %xmm1, %edx
1024; AVX1-NEXT: movzwl %dx, %edx
1025; AVX1-NEXT: orl %ecx, %edx
1026; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1027; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1028; AVX1-NEXT: vmovd %xmm1, %ecx
1029; AVX1-NEXT: shll $16, %ecx
1030; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1031; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1032; AVX1-NEXT: vmovd %xmm0, %esi
1033; AVX1-NEXT: movzwl %si, %esi
1034; AVX1-NEXT: orl %ecx, %esi
1035; AVX1-NEXT: shlq $32, %rsi
1036; AVX1-NEXT: orq %rdx, %rsi
1037; AVX1-NEXT: vmovq %rsi, %xmm0
1038; AVX1-NEXT: vmovq %rax, %xmm1
1039; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1040; AVX1-NEXT: vzeroupper
1041; AVX1-NEXT: retq
1042;
1043; AVX2-LABEL: cvt_8f32_to_8i16:
1044; AVX2: # BB#0:
1045; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1046; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1047; AVX2-NEXT: vmovd %xmm1, %eax
1048; AVX2-NEXT: shll $16, %eax
1049; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1050; AVX2-NEXT: vmovd %xmm1, %ecx
1051; AVX2-NEXT: movzwl %cx, %ecx
1052; AVX2-NEXT: orl %eax, %ecx
1053; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1054; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1055; AVX2-NEXT: vmovd %xmm1, %edx
1056; AVX2-NEXT: shll $16, %edx
1057; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1058; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1059; AVX2-NEXT: vmovd %xmm1, %eax
1060; AVX2-NEXT: movzwl %ax, %eax
1061; AVX2-NEXT: orl %edx, %eax
1062; AVX2-NEXT: shlq $32, %rax
1063; AVX2-NEXT: orq %rcx, %rax
1064; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
1065; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1066; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1067; AVX2-NEXT: vmovd %xmm1, %ecx
1068; AVX2-NEXT: shll $16, %ecx
1069; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1070; AVX2-NEXT: vmovd %xmm1, %edx
1071; AVX2-NEXT: movzwl %dx, %edx
1072; AVX2-NEXT: orl %ecx, %edx
1073; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1074; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1075; AVX2-NEXT: vmovd %xmm1, %ecx
1076; AVX2-NEXT: shll $16, %ecx
1077; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1078; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1079; AVX2-NEXT: vmovd %xmm0, %esi
1080; AVX2-NEXT: movzwl %si, %esi
1081; AVX2-NEXT: orl %ecx, %esi
1082; AVX2-NEXT: shlq $32, %rsi
1083; AVX2-NEXT: orq %rdx, %rsi
1084; AVX2-NEXT: vmovq %rsi, %xmm0
1085; AVX2-NEXT: vmovq %rax, %xmm1
1086; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1087; AVX2-NEXT: vzeroupper
1088; AVX2-NEXT: retq
1089;
1090; AVX512-LABEL: cvt_8f32_to_8i16:
1091; AVX512: # BB#0:
1092; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1093; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1094; AVX512-NEXT: vmovd %xmm1, %eax
1095; AVX512-NEXT: shll $16, %eax
1096; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1097; AVX512-NEXT: vmovd %xmm1, %ecx
1098; AVX512-NEXT: movzwl %cx, %ecx
1099; AVX512-NEXT: orl %eax, %ecx
1100; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1101; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1102; AVX512-NEXT: vmovd %xmm1, %edx
1103; AVX512-NEXT: shll $16, %edx
1104; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1105; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1106; AVX512-NEXT: vmovd %xmm1, %eax
1107; AVX512-NEXT: movzwl %ax, %eax
1108; AVX512-NEXT: orl %edx, %eax
1109; AVX512-NEXT: shlq $32, %rax
1110; AVX512-NEXT: orq %rcx, %rax
1111; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
1112; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1113; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1114; AVX512-NEXT: vmovd %xmm1, %ecx
1115; AVX512-NEXT: shll $16, %ecx
1116; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1117; AVX512-NEXT: vmovd %xmm1, %edx
1118; AVX512-NEXT: movzwl %dx, %edx
1119; AVX512-NEXT: orl %ecx, %edx
1120; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1121; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1122; AVX512-NEXT: vmovd %xmm1, %ecx
1123; AVX512-NEXT: shll $16, %ecx
1124; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1125; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1126; AVX512-NEXT: vmovd %xmm0, %esi
1127; AVX512-NEXT: movzwl %si, %esi
1128; AVX512-NEXT: orl %ecx, %esi
1129; AVX512-NEXT: shlq $32, %rsi
1130; AVX512-NEXT: orq %rdx, %rsi
1131; AVX512-NEXT: vmovq %rsi, %xmm0
1132; AVX512-NEXT: vmovq %rax, %xmm1
1133; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1134; AVX512-NEXT: retq
1135 %1 = fptrunc <8 x float> %a0 to <8 x half>
1136 %2 = bitcast <8 x half> %1 to <8 x i16>
1137 ret <8 x i16> %2
1138}
1139
1140define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) {
1141; AVX1-LABEL: cvt_16f32_to_16i16:
1142; AVX1: # BB#0:
1143; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1144; AVX1-NEXT: vmovd %xmm2, %eax
1145; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1146; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1147; AVX1-NEXT: vmovd %eax, %xmm3
1148; AVX1-NEXT: vmovd %xmm2, %eax
1149; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1150; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1151; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1152; AVX1-NEXT: vmovd %xmm2, %eax
1153; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
1154; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1155; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1156; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1157; AVX1-NEXT: vmovd %xmm1, %eax
1158; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1159; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1160; AVX1-NEXT: vmovd %xmm1, %eax
1161; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1162; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1163; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1164; AVX1-NEXT: vmovd %xmm1, %eax
1165; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1166; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1167; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1168; AVX1-NEXT: vmovd %xmm1, %eax
1169; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1170; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1171; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1172; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1173; AVX1-NEXT: vmovd %xmm2, %eax
1174; AVX1-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1175; AVX1-NEXT: vmovd %xmm1, %eax
1176; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1177; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1178; AVX1-NEXT: vmovd %eax, %xmm3
1179; AVX1-NEXT: vmovd %xmm1, %eax
1180; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1181; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1182; AVX1-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1183; AVX1-NEXT: vmovd %xmm1, %eax
1184; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1185; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1186; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1187; AVX1-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1188; AVX1-NEXT: vmovd %xmm0, %eax
1189; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1190; AVX1-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1191; AVX1-NEXT: vmovd %xmm0, %eax
1192; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1193; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1194; AVX1-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1195; AVX1-NEXT: vmovd %xmm0, %eax
1196; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1197; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1198; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1199; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1200; AVX1-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1201; AVX1-NEXT: vmovd %xmm1, %eax
1202; AVX1-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1203; AVX1-NEXT: vmovd %xmm0, %eax
1204; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1205; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
1206; AVX1-NEXT: retq
1207;
1208; AVX2-LABEL: cvt_16f32_to_16i16:
1209; AVX2: # BB#0:
1210; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1211; AVX2-NEXT: vmovd %xmm2, %eax
1212; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1213; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1214; AVX2-NEXT: vmovd %eax, %xmm3
1215; AVX2-NEXT: vmovd %xmm2, %eax
1216; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1217; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1218; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1219; AVX2-NEXT: vmovd %xmm2, %eax
1220; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
1221; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1222; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1223; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1224; AVX2-NEXT: vmovd %xmm1, %eax
1225; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1226; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1227; AVX2-NEXT: vmovd %xmm1, %eax
1228; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1229; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1230; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1231; AVX2-NEXT: vmovd %xmm1, %eax
1232; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1233; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1234; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1235; AVX2-NEXT: vmovd %xmm1, %eax
1236; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1237; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1238; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1239; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1240; AVX2-NEXT: vmovd %xmm2, %eax
1241; AVX2-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1242; AVX2-NEXT: vmovd %xmm1, %eax
1243; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1244; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1245; AVX2-NEXT: vmovd %eax, %xmm3
1246; AVX2-NEXT: vmovd %xmm1, %eax
1247; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1248; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1249; AVX2-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1250; AVX2-NEXT: vmovd %xmm1, %eax
1251; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
1252; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1253; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1254; AVX2-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1255; AVX2-NEXT: vmovd %xmm0, %eax
1256; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1257; AVX2-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1258; AVX2-NEXT: vmovd %xmm0, %eax
1259; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1260; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1261; AVX2-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1262; AVX2-NEXT: vmovd %xmm0, %eax
1263; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1264; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1265; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
1266; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1267; AVX2-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1268; AVX2-NEXT: vmovd %xmm1, %eax
1269; AVX2-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1270; AVX2-NEXT: vmovd %xmm0, %eax
1271; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1272; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1273; AVX2-NEXT: retq
1274;
1275; AVX512-LABEL: cvt_16f32_to_16i16:
1276; AVX512: # BB#0:
1277; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
1278; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
1279; AVX512-NEXT: vmovd %xmm2, %eax
1280; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1281; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1282; AVX512-NEXT: vmovd %eax, %xmm3
1283; AVX512-NEXT: vmovd %xmm2, %eax
1284; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1285; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1286; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1287; AVX512-NEXT: vmovd %xmm2, %eax
1288; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
1289; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
1290; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1291; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1292; AVX512-NEXT: vmovd %xmm1, %eax
1293; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
1294; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1295; AVX512-NEXT: vmovd %xmm1, %eax
1296; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1297; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1298; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1299; AVX512-NEXT: vmovd %xmm1, %eax
1300; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
1301; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1302; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1303; AVX512-NEXT: vmovd %xmm1, %eax
1304; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1305; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
1306; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
1307; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
1308; AVX512-NEXT: vmovd %xmm2, %eax
1309; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
1310; AVX512-NEXT: vmovd %xmm1, %eax
1311; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1312; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1313; AVX512-NEXT: vmovd %eax, %xmm3
1314; AVX512-NEXT: vmovd %xmm1, %eax
1315; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1316; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1317; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
1318; AVX512-NEXT: vmovd %xmm1, %eax
1319; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
1320; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
1321; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1322; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
1323; AVX512-NEXT: vmovd %xmm0, %eax
1324; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
1325; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
1326; AVX512-NEXT: vmovd %xmm0, %eax
1327; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1328; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1329; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
1330; AVX512-NEXT: vmovd %xmm0, %eax
1331; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
1332; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1333; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
1334; AVX512-NEXT: vmovd %xmm0, %eax
1335; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
1336; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1337; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
1338; AVX512-NEXT: vmovd %xmm0, %eax
1339; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
1340; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
1341; AVX512-NEXT: retq
1342 %1 = fptrunc <16 x float> %a0 to <16 x half>
1343 %2 = bitcast <16 x half> %1 to <16 x i16>
1344 ret <16 x i16> %2
1345}
1346
1347;
1348; Float to Half (Store)
1349;
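; The store variants convert each lane with vcvtps2ph, move the result to a GPR
; with vmovd, and write the individual i16 values out with movw at the matching
; offsets (or store a packed xmm with vmovdqa for the shuffled 8i16 cases).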
1350
1351define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) {
1352; ALL-LABEL: store_cvt_4f32_to_4i16:
1353; ALL: # BB#0:
1354; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1355; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1356; ALL-NEXT: vmovd %xmm1, %eax
1357; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1358; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1359; ALL-NEXT: vmovd %xmm1, %ecx
1360; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1361; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1362; ALL-NEXT: vmovd %xmm1, %edx
1363; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1364; ALL-NEXT: vmovd %xmm0, %esi
1365; ALL-NEXT: movw %si, (%rdi)
1366; ALL-NEXT: movw %dx, 6(%rdi)
1367; ALL-NEXT: movw %cx, 4(%rdi)
1368; ALL-NEXT: movw %ax, 2(%rdi)
1369; ALL-NEXT: retq
1370 %1 = fptrunc <4 x float> %a0 to <4 x half>
1371 %2 = bitcast <4 x half> %1 to <4 x i16>
1372 store <4 x i16> %2, <4 x i16>* %a1
1373 ret void
1374}
1375
1376define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) {
1377; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
1378; ALL: # BB#0:
1379; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1380; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1381; ALL-NEXT: vmovd %xmm1, %eax
1382; ALL-NEXT: shll $16, %eax
1383; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
1384; ALL-NEXT: vmovd %xmm1, %ecx
1385; ALL-NEXT: movzwl %cx, %ecx
1386; ALL-NEXT: orl %eax, %ecx
1387; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
1388; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
1389; ALL-NEXT: vmovd %xmm1, %eax
1390; ALL-NEXT: shll $16, %eax
1391; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1392; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
1393; ALL-NEXT: vmovd %xmm0, %edx
1394; ALL-NEXT: movzwl %dx, %edx
1395; ALL-NEXT: orl %eax, %edx
1396; ALL-NEXT: shlq $32, %rdx
1397; ALL-NEXT: orq %rcx, %rdx
1398; ALL-NEXT: vmovq %rdx, %xmm0
1399; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1400; ALL-NEXT: vmovdqa %xmm0, (%rdi)
1401; ALL-NEXT: retq
1402 %1 = fptrunc <4 x float> %a0 to <4 x half>
1403 %2 = bitcast <4 x half> %1 to <4 x i16>
1404 %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1405 store <8 x i16> %3, <8 x i16>* %a1
1406 ret void
1407}
1408
define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) {
; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
; ALL:       # BB#0:
; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; ALL-NEXT:    vmovd %xmm1, %eax
; ALL-NEXT:    shll $16, %eax
; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
; ALL-NEXT:    vmovd %xmm1, %ecx
; ALL-NEXT:    movzwl %cx, %ecx
; ALL-NEXT:    orl %eax, %ecx
; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; ALL-NEXT:    vmovd %xmm1, %eax
; ALL-NEXT:    shll $16, %eax
; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; ALL-NEXT:    vmovd %xmm0, %edx
; ALL-NEXT:    movzwl %dx, %edx
; ALL-NEXT:    orl %eax, %edx
; ALL-NEXT:    shlq $32, %rdx
; ALL-NEXT:    orq %rcx, %rdx
; ALL-NEXT:    vmovq %rdx, %xmm0
; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; ALL-NEXT:    vmovdqa %xmm0, (%rdi)
; ALL-NEXT:    retq
  %1 = fptrunc <4 x float> %a0 to <4 x half>
  %2 = bitcast <4 x half> %1 to <4 x i16>
  %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i16> %3, <8 x i16>* %a1
  ret void
}

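; fptrunc <8 x float> to <8 x half>: the high 128-bit lane is split off with
; vextractf128 and all eight elements are converted and stored individually.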
define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) {
; AVX1-LABEL: store_cvt_8f32_to_8i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovd %xmm1, %r8d
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovd %xmm1, %r9d
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovd %xmm1, %r10d
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX1-NEXT:    vmovd %xmm2, %r11d
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX1-NEXT:    vmovd %xmm2, %eax
; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX1-NEXT:    vmovd %xmm2, %ecx
; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %edx
; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %esi
; AVX1-NEXT:    movw %si, 8(%rdi)
; AVX1-NEXT:    movw %dx, (%rdi)
; AVX1-NEXT:    movw %cx, 14(%rdi)
; AVX1-NEXT:    movw %ax, 12(%rdi)
; AVX1-NEXT:    movw %r11w, 10(%rdi)
; AVX1-NEXT:    movw %r10w, 6(%rdi)
; AVX1-NEXT:    movw %r9w, 4(%rdi)
; AVX1-NEXT:    movw %r8w, 2(%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_8f32_to_8i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT:    vmovd %xmm1, %r8d
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT:    vmovd %xmm1, %r9d
; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT:    vmovd %xmm1, %r10d
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX2-NEXT:    vmovd %xmm2, %r11d
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX2-NEXT:    vmovd %xmm2, %eax
; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX2-NEXT:    vmovd %xmm2, %ecx
; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %edx
; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %esi
; AVX2-NEXT:    movw %si, 8(%rdi)
; AVX2-NEXT:    movw %dx, (%rdi)
; AVX2-NEXT:    movw %cx, 14(%rdi)
; AVX2-NEXT:    movw %ax, 12(%rdi)
; AVX2-NEXT:    movw %r11w, 10(%rdi)
; AVX2-NEXT:    movw %r10w, 6(%rdi)
; AVX2-NEXT:    movw %r9w, 4(%rdi)
; AVX2-NEXT:    movw %r8w, 2(%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: store_cvt_8f32_to_8i16:
; AVX512:       # BB#0:
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT:    vmovd %xmm1, %r8d
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT:    vmovd %xmm1, %r9d
; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT:    vmovd %xmm1, %r10d
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT:    vmovd %xmm2, %r11d
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT:    vmovd %xmm2, %eax
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT:    vmovd %xmm2, %ecx
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %edx
; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %esi
; AVX512-NEXT:    movw %si, 8(%rdi)
; AVX512-NEXT:    movw %dx, (%rdi)
; AVX512-NEXT:    movw %cx, 14(%rdi)
; AVX512-NEXT:    movw %ax, 12(%rdi)
; AVX512-NEXT:    movw %r11w, 10(%rdi)
; AVX512-NEXT:    movw %r10w, 6(%rdi)
; AVX512-NEXT:    movw %r9w, 4(%rdi)
; AVX512-NEXT:    movw %r8w, 2(%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc <8 x float> %a0 to <8 x half>
  %2 = bitcast <8 x half> %1 to <8 x i16>
  store <8 x i16> %2, <8 x i16>* %a1
  ret void
}

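; fptrunc <16 x float> to <16 x half>: AVX1/AVX2 receive the source in two YMM
; registers, while AVX512 extracts the upper half of ZMM0 with vextractf64x4
; before scalarizing the conversions and 16-bit stores.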
define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) {
; AVX1-LABEL: store_cvt_16f32_to_16i16:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
; AVX1-NEXT:    vmovd %xmm4, %eax
; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
; AVX1-NEXT:    movw %ax, 24(%rdi)
; AVX1-NEXT:    vmovd %xmm4, %eax
; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
; AVX1-NEXT:    movw %ax, 16(%rdi)
; AVX1-NEXT:    vmovd %xmm4, %eax
; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
; AVX1-NEXT:    movw %ax, 8(%rdi)
; AVX1-NEXT:    vmovd %xmm4, %eax
; AVX1-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX1-NEXT:    movw %ax, (%rdi)
; AVX1-NEXT:    vmovd %xmm4, %eax
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX1-NEXT:    movw %ax, 30(%rdi)
; AVX1-NEXT:    vmovd %xmm4, %eax
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX1-NEXT:    movw %ax, 28(%rdi)
; AVX1-NEXT:    vmovd %xmm3, %eax
; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX1-NEXT:    movw %ax, 26(%rdi)
; AVX1-NEXT:    vmovd %xmm3, %eax
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX1-NEXT:    movw %ax, 22(%rdi)
; AVX1-NEXT:    vmovd %xmm3, %eax
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX1-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT:    movw %ax, 20(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %eax
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT:    movw %ax, 18(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %eax
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX1-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX1-NEXT:    movw %ax, 14(%rdi)
; AVX1-NEXT:    vmovd %xmm2, %eax
; AVX1-NEXT:    movw %ax, 12(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %eax
; AVX1-NEXT:    movw %ax, 10(%rdi)
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    movw %ax, 6(%rdi)
; AVX1-NEXT:    vmovd %xmm3, %eax
; AVX1-NEXT:    movw %ax, 4(%rdi)
; AVX1-NEXT:    vmovd %xmm4, %eax
; AVX1-NEXT:    movw %ax, 2(%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: store_cvt_16f32_to_16i16:
; AVX2:       # BB#0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
; AVX2-NEXT:    vmovd %xmm4, %eax
; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
; AVX2-NEXT:    movw %ax, 24(%rdi)
; AVX2-NEXT:    vmovd %xmm4, %eax
; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
; AVX2-NEXT:    movw %ax, 16(%rdi)
; AVX2-NEXT:    vmovd %xmm4, %eax
; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
; AVX2-NEXT:    movw %ax, 8(%rdi)
; AVX2-NEXT:    vmovd %xmm4, %eax
; AVX2-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX2-NEXT:    movw %ax, (%rdi)
; AVX2-NEXT:    vmovd %xmm4, %eax
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX2-NEXT:    movw %ax, 30(%rdi)
; AVX2-NEXT:    vmovd %xmm4, %eax
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX2-NEXT:    movw %ax, 28(%rdi)
; AVX2-NEXT:    vmovd %xmm3, %eax
; AVX2-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX2-NEXT:    movw %ax, 26(%rdi)
; AVX2-NEXT:    vmovd %xmm3, %eax
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX2-NEXT:    movw %ax, 22(%rdi)
; AVX2-NEXT:    vmovd %xmm3, %eax
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX2-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT:    movw %ax, 20(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %eax
; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT:    movw %ax, 18(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %eax
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
; AVX2-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX2-NEXT:    movw %ax, 14(%rdi)
; AVX2-NEXT:    vmovd %xmm2, %eax
; AVX2-NEXT:    movw %ax, 12(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %eax
; AVX2-NEXT:    movw %ax, 10(%rdi)
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    movw %ax, 6(%rdi)
; AVX2-NEXT:    vmovd %xmm3, %eax
; AVX2-NEXT:    movw %ax, 4(%rdi)
; AVX2-NEXT:    vmovd %xmm4, %eax
; AVX2-NEXT:    movw %ax, 2(%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: store_cvt_16f32_to_16i16:
; AVX512:       # BB#0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm4
; AVX512-NEXT:    vmovd %xmm4, %eax
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm4
; AVX512-NEXT:    movw %ax, 24(%rdi)
; AVX512-NEXT:    vmovd %xmm4, %eax
; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm4
; AVX512-NEXT:    movw %ax, 16(%rdi)
; AVX512-NEXT:    vmovd %xmm4, %eax
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm4
; AVX512-NEXT:    movw %ax, 8(%rdi)
; AVX512-NEXT:    vmovd %xmm4, %eax
; AVX512-NEXT:    vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX512-NEXT:    movw %ax, (%rdi)
; AVX512-NEXT:    vmovd %xmm4, %eax
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX512-NEXT:    movw %ax, 30(%rdi)
; AVX512-NEXT:    vmovd %xmm4, %eax
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX512-NEXT:    movw %ax, 28(%rdi)
; AVX512-NEXT:    vmovd %xmm3, %eax
; AVX512-NEXT:    vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX512-NEXT:    movw %ax, 26(%rdi)
; AVX512-NEXT:    vmovd %xmm3, %eax
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX512-NEXT:    movw %ax, 22(%rdi)
; AVX512-NEXT:    vmovd %xmm3, %eax
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT:    movw %ax, 20(%rdi)
; AVX512-NEXT:    vmovd %xmm2, %eax
; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT:    movw %ax, 18(%rdi)
; AVX512-NEXT:    vmovd %xmm2, %eax
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
; AVX512-NEXT:    movw %ax, 14(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %eax
; AVX512-NEXT:    movw %ax, 12(%rdi)
; AVX512-NEXT:    vmovd %xmm2, %eax
; AVX512-NEXT:    movw %ax, 10(%rdi)
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    movw %ax, 6(%rdi)
; AVX512-NEXT:    vmovd %xmm3, %eax
; AVX512-NEXT:    movw %ax, 4(%rdi)
; AVX512-NEXT:    vmovd %xmm4, %eax
; AVX512-NEXT:    movw %ax, 2(%rdi)
; AVX512-NEXT:    retq
  %1 = fptrunc <16 x float> %a0 to <16 x half>
  %2 = bitcast <16 x half> %1 to <16 x i16>
  store <16 x i16> %2, <16 x i16>* %a1
  ret void
}