; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

; CHECK: vmovaps
; CHECK: vmovaps
; CHECK: vmovapd
; CHECK: vmovapd
; CHECK: vmovaps
; CHECK: vmovaps
; 32-byte-aligned loads/stores of 256-bit f64/f32/i64 vectors; the values are
; kept live across a call so both the loads and the stores must be emitted
; (checked by the vmovaps/vmovapd lines at the top of the file).
define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind uwtable ssp {
entry:
  %0 = bitcast double* %d to <4 x double>*
  %tmp1.i = load <4 x double>* %0, align 32
  %1 = bitcast float* %f to <8 x float>*
  %tmp1.i17 = load <8 x float>* %1, align 32
  %tmp1.i16 = load <4 x i64>* %i, align 32
  tail call void @dummy(<4 x double> %tmp1.i, <8 x float> %tmp1.i17, <4 x i64> %tmp1.i16) nounwind
  store <4 x double> %tmp1.i, <4 x double>* %0, align 32
  store <8 x float> %tmp1.i17, <8 x float>* %1, align 32
  store <4 x i64> %tmp1.i16, <4 x i64>* %i, align 32
  ret void
}

declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)

;;
;; The two tests below check that we must fold load + scalar_to_vector
;; + ins_subvec + zext into only a single vmovss or vmovsd
; Insert a loaded f32 into lane 0 of a zero vector: must fold to one vmovss
; from memory (vmovss implicitly zeroes the remaining lanes).
; CHECK: vmovss (%
define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
  %val = load float* %ptr
  %i0 = insertelement <8 x float> zeroinitializer, float %val, i32 0
  ret <8 x float> %i0
}

; Same as @mov00 but for f64: a single vmovsd load should be emitted.
; CHECK: vmovsd (%
define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
  %val = load double* %ptr
  %i0 = insertelement <4 x double> zeroinitializer, double %val, i32 0
  ret <4 x double> %i0
}

; 32-byte-aligned v16i16 store uses the aligned 256-bit move.
; CHECK: vmovaps %ymm
define void @storev16i16(<16 x i16> %a) nounwind {
  store <16 x i16> %a, <16 x i16>* undef, align 32
  unreachable
}

; Under-aligned (align 4) v16i16 store must use the unaligned move.
; CHECK: vmovups %ymm
define void @storev16i16_01(<16 x i16> %a) nounwind {
  store <16 x i16> %a, <16 x i16>* undef, align 4
  unreachable
}

; 32-byte-aligned v32i8 store uses the aligned 256-bit move.
; CHECK: vmovaps %ymm
define void @storev32i8(<32 x i8> %a) nounwind {
  store <32 x i8> %a, <32 x i8>* undef, align 32
  unreachable
}

; Under-aligned (align 4) v32i8 store must use the unaligned move.
; CHECK: vmovups %ymm
define void @storev32i8_01(<32 x i8> %a) nounwind {
  store <32 x i8> %a, <32 x i8>* undef, align 4
  unreachable
}

; It is faster to make two saves, if the data is already in XMM registers. For
; example, after making an integer operation: the concatenation of two v4i32
; values stored with only 16-byte alignment should be two 128-bit stores, not
; a vinsertf128 followed by a 256-bit store.
; CHECK: _double_save
; CHECK-NOT: vinsertf128 $1
; CHECK-NOT: vinsertf128 $0
; CHECK: vmovaps %xmm
; CHECK: vmovaps %xmm
define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
entry:
  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i32> %Z, <8 x i32>* %P, align 16
  ret void
}
