Oliver Stannard | 89d1542 | 2014-08-27 16:16:04 +0000 | [diff] [blame] | 1 | ; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s |
| 2 | |
| 3 | ; Simple load of v4f16 |
| 4 | define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 { |
| 5 | ; CHECK-LABEL: load_64: |
| 6 | ; CHECK: ldr d0, [x0] |
| 7 | entry: |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 8 | %0 = load <4 x half>, <4 x half>* %a, align 8 |
Oliver Stannard | 89d1542 | 2014-08-27 16:16:04 +0000 | [diff] [blame] | 9 | ret <4 x half> %0 |
| 10 | } |
| 11 | |
| 12 | ; Simple load of v8f16 |
| 13 | define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 { |
| 14 | ; CHECK-LABEL: load_128: |
| 15 | ; CHECK: ldr q0, [x0] |
| 16 | entry: |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 17 | %0 = load <8 x half>, <8 x half>* %a, align 16 |
Oliver Stannard | 89d1542 | 2014-08-27 16:16:04 +0000 | [diff] [blame] | 18 | ret <8 x half> %0 |
| 19 | } |
| 20 | |
| 21 | ; Duplicating load to v4f16 |
| 22 | define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 { |
| 23 | ; CHECK-LABEL: load_dup_64: |
| 24 | ; CHECK: ld1r { v0.4h }, [x0] |
| 25 | entry: |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 26 | %0 = load half, half* %a, align 2 |
Oliver Stannard | 89d1542 | 2014-08-27 16:16:04 +0000 | [diff] [blame] | 27 | %1 = insertelement <4 x half> undef, half %0, i32 0 |
| 28 | %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer |
| 29 | ret <4 x half> %2 |
| 30 | } |
| 31 | |
| 32 | ; Duplicating load to v8f16 |
| 33 | define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 { |
| 34 | ; CHECK-LABEL: load_dup_128: |
| 35 | ; CHECK: ld1r { v0.8h }, [x0] |
| 36 | entry: |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 37 | %0 = load half, half* %a, align 2 |
Oliver Stannard | 89d1542 | 2014-08-27 16:16:04 +0000 | [diff] [blame] | 38 | %1 = insertelement <8 x half> undef, half %0, i32 0 |
| 39 | %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer |
| 40 | ret <8 x half> %2 |
| 41 | } |
| 42 | |
| 43 | ; Load to one lane of v4f16 |
| 44 | define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 { |
| 45 | ; CHECK-LABEL: load_lane_64: |
| 46 | ; CHECK: ld1 { v0.h }[2], [x0] |
| 47 | entry: |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 48 | %0 = load half, half* %a, align 2 |
Oliver Stannard | 89d1542 | 2014-08-27 16:16:04 +0000 | [diff] [blame] | 49 | %1 = insertelement <4 x half> %b, half %0, i32 2 |
| 50 | ret <4 x half> %1 |
| 51 | } |
| 52 | |
| 53 | ; Load to one lane of v8f16 |
| 54 | define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 { |
| 55 | ; CHECK-LABEL: load_lane_128: |
| 56 | ; CHECK: ld1 { v0.h }[5], [x0] |
| 57 | entry: |
David Blaikie | a79ac14 | 2015-02-27 21:17:42 +0000 | [diff] [blame] | 58 | %0 = load half, half* %a, align 2 |
Oliver Stannard | 89d1542 | 2014-08-27 16:16:04 +0000 | [diff] [blame] | 59 | %1 = insertelement <8 x half> %b, half %0, i32 5 |
| 60 | ret <8 x half> %1 |
| 61 | } |
| 62 | |
| 63 | ; Simple store of v4f16 |
| 64 | define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 { |
| 65 | ; CHECK-LABEL: store_64: |
| 66 | ; CHECK: str d0, [x0] |
| 67 | entry: |
| 68 | store <4 x half> %b, <4 x half>* %a, align 8 |
| 69 | ret void |
| 70 | } |
| 71 | |
| 72 | ; Simple store of v8f16 |
| 73 | define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 { |
| 74 | ; CHECK-LABEL: store_128: |
| 75 | ; CHECK: str q0, [x0] |
| 76 | entry: |
| 77 | store <8 x half> %b, <8 x half>* %a, align 16 |
| 78 | ret void |
| 79 | } |
| 80 | |
| 81 | ; Store from one lane of v4f16 |
| 82 | define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 { |
| 83 | ; CHECK-LABEL: store_lane_64: |
| 84 | ; CHECK: st1 { v0.h }[2], [x0] |
| 85 | entry: |
| 86 | %0 = extractelement <4 x half> %b, i32 2 |
| 87 | store half %0, half* %a, align 2 |
| 88 | ret void |
| 89 | } |
| 90 | |
| 91 | ; Store from one lane of v8f16 |
| 92 | define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 { |
| 93 | ; CHECK-LABEL: store_lane_128: |
| 94 | ; CHECK: st1 { v0.h }[5], [x0] |
| 95 | entry: |
| 96 | %0 = extractelement <8 x half> %b, i32 5 |
| 97 | store half %0, half* %a, align 2 |
| 98 | ret void |
| 99 | } |
| 100 | |
| 101 | ; NEON intrinsics - (de-)interleaving loads and stores |
| 102 | declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*) |
| 103 | declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*) |
| 104 | declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*) |
| 105 | declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*) |
| 106 | declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*) |
| 107 | declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*) |
| 108 | declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*) |
| 109 | declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*) |
| 110 | declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*) |
| 111 | declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*) |
| 112 | declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*) |
| 113 | declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*) |
| 114 | |
| 115 | ; Load 2 x v4f16 with de-interleaving |
| 116 | define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 { |
| 117 | ; CHECK-LABEL: load_interleave_64_2: |
| 118 | ; CHECK: ld2 { v0.4h, v1.4h }, [x0] |
| 119 | entry: |
| 120 | %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a) |
| 121 | ret { <4 x half>, <4 x half> } %0 |
| 122 | } |
| 123 | |
| 124 | ; Load 3 x v4f16 with de-interleaving |
| 125 | define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 { |
| 126 | ; CHECK-LABEL: load_interleave_64_3: |
| 127 | ; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0] |
| 128 | entry: |
| 129 | %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a) |
| 130 | ret { <4 x half>, <4 x half>, <4 x half> } %0 |
| 131 | } |
| 132 | |
| 133 | ; Load 4 x v4f16 with de-interleaving |
| 134 | define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 { |
| 135 | ; CHECK-LABEL: load_interleave_64_4: |
| 136 | ; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] |
| 137 | entry: |
| 138 | %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a) |
| 139 | ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0 |
| 140 | } |
| 141 | |
| 142 | ; Store 2 x v4f16 with interleaving |
| 143 | define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 { |
| 144 | ; CHECK-LABEL: store_interleave_64_2: |
| 145 | ; CHECK: st2 { v0.4h, v1.4h }, [x0] |
| 146 | entry: |
| 147 | tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a) |
| 148 | ret void |
| 149 | } |
| 150 | |
| 151 | ; Store 3 x v4f16 with interleaving |
| 152 | define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 { |
| 153 | ; CHECK-LABEL: store_interleave_64_3: |
| 154 | ; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0] |
| 155 | entry: |
| 156 | tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a) |
| 157 | ret void |
| 158 | } |
| 159 | |
| 160 | ; Store 4 x v4f16 with interleaving |
| 161 | define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 { |
| 162 | ; CHECK-LABEL: store_interleave_64_4: |
| 163 | ; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] |
| 164 | entry: |
| 165 | tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a) |
| 166 | ret void |
| 167 | } |
| 168 | |
| 169 | ; Load 2 x v8f16 with de-interleaving |
| 170 | define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 { |
| 171 | ; CHECK-LABEL: load_interleave_128_2: |
| 172 | ; CHECK: ld2 { v0.8h, v1.8h }, [x0] |
| 173 | entry: |
| 174 | %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a) |
| 175 | ret { <8 x half>, <8 x half> } %0 |
| 176 | } |
| 177 | |
| 178 | ; Load 3 x v8f16 with de-interleaving |
| 179 | define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 { |
| 180 | ; CHECK-LABEL: load_interleave_128_3: |
| 181 | ; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0] |
| 182 | entry: |
| 183 | %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a) |
| 184 | ret { <8 x half>, <8 x half>, <8 x half> } %0 |
| 185 | } |
| 186 | |
| 187 | ; Load 4 x v8f16 with de-interleaving |
| 188 | define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 { |
| 189 | ; CHECK-LABEL: load_interleave_128_4: |
| 190 | ; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] |
| 191 | entry: |
| 192 | %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a) |
| 193 | ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0 |
| 194 | } |
| 195 | |
| 196 | ; Store 2 x v8f16 with interleaving |
| 197 | define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 { |
| 198 | ; CHECK-LABEL: store_interleave_128_2: |
| 199 | ; CHECK: st2 { v0.8h, v1.8h }, [x0] |
| 200 | entry: |
| 201 | tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a) |
| 202 | ret void |
| 203 | } |
| 204 | |
| 205 | ; Store 3 x v8f16 with interleaving |
| 206 | define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 { |
| 207 | ; CHECK-LABEL: store_interleave_128_3: |
| 208 | ; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0] |
| 209 | entry: |
| 210 | tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a) |
| 211 | ret void |
| 212 | } |
| 213 | |
| 214 | ; Store 4 x v8f16 with interleaving |
| 215 | define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 { |
| 216 | ; CHECK-LABEL: store_interleave_128_4: |
| 217 | ; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] |
| 218 | entry: |
| 219 | tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a) |
| 220 | ret void |
| 221 | } |
| 222 | |
| 223 | ; NEON intrinsics - duplicating loads |
| 224 | declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*) |
| 225 | declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*) |
| 226 | declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*) |
| 227 | declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*) |
| 228 | declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*) |
| 229 | declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*) |
| 230 | |
| 231 | ; Load 2 x v4f16 with duplication |
| 232 | define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 { |
| 233 | ; CHECK-LABEL: load_dup_64_2: |
| 234 | ; CHECK: ld2r { v0.4h, v1.4h }, [x0] |
| 235 | entry: |
| 236 | %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a) |
| 237 | ret { <4 x half>, <4 x half> } %0 |
| 238 | } |
| 239 | |
| 240 | ; Load 3 x v4f16 with duplication |
| 241 | define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 { |
| 242 | ; CHECK-LABEL: load_dup_64_3: |
| 243 | ; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0] |
| 244 | entry: |
| 245 | %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a) |
| 246 | ret { <4 x half>, <4 x half>, <4 x half> } %0 |
| 247 | } |
| 248 | |
| 249 | ; Load 4 x v4f16 with duplication |
| 250 | define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 { |
| 251 | ; CHECK-LABEL: load_dup_64_4: |
| 252 | ; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] |
| 253 | entry: |
| 254 | %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a) |
| 255 | ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0 |
| 256 | } |
| 257 | |
| 258 | ; Load 2 x v8f16 with duplication |
| 259 | define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 { |
| 260 | ; CHECK-LABEL: load_dup_128_2: |
| 261 | ; CHECK: ld2r { v0.8h, v1.8h }, [x0] |
| 262 | entry: |
| 263 | %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a) |
| 264 | ret { <8 x half>, <8 x half> } %0 |
| 265 | } |
| 266 | |
| 267 | ; Load 3 x v8f16 with duplication |
| 268 | define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 { |
| 269 | ; CHECK-LABEL: load_dup_128_3: |
| 270 | ; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0] |
| 271 | entry: |
| 272 | %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a) |
| 273 | ret { <8 x half>, <8 x half>, <8 x half> } %0 |
| 274 | } |
| 275 | |
| 276 | ; Load 4 x v8f16 with duplication |
| 277 | define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 { |
| 278 | ; CHECK-LABEL: load_dup_128_4: |
| 279 | ; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] |
| 280 | entry: |
| 281 | %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a) |
| 282 | ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0 |
| 283 | } |
| 284 | |
| 285 | |
| 286 | ; NEON intrinsics - loads and stores to/from one lane |
| 287 | declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*) |
| 288 | declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*) |
| 289 | declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*) |
| 290 | declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*) |
| 291 | declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*) |
| 292 | declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*) |
| 293 | declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*) |
| 294 | declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*) |
| 295 | declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*) |
| 296 | declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*) |
| 297 | declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*) |
| 298 | declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*) |
| 299 | |
| 300 | ; Load one lane of 2 x v4f16 |
| 301 | define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 { |
| 302 | ; CHECK-LABEL: load_lane_64_2: |
| 303 | ; CHECK: ld2 { v0.h, v1.h }[2], [x0] |
| 304 | entry: |
| 305 | %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a) |
| 306 | ret { <4 x half>, <4 x half> } %0 |
| 307 | } |
| 308 | |
| 309 | ; Load one lane of 3 x v4f16 |
| 310 | define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 { |
| 311 | ; CHECK-LABEL: load_lane_64_3: |
| 312 | ; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0] |
| 313 | entry: |
| 314 | %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a) |
| 315 | ret { <4 x half>, <4 x half>, <4 x half> } %0 |
| 316 | } |
| 317 | |
| 318 | ; Load one lane of 4 x v4f16 |
| 319 | define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 { |
| 320 | ; CHECK-LABEL: load_lane_64_4: |
| 321 | ; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0] |
| 322 | entry: |
| 323 | %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a) |
| 324 | ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0 |
| 325 | } |
| 326 | |
| 327 | ; Store one lane of 2 x v4f16 |
| 328 | define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 { |
| 329 | ; CHECK-LABEL: store_lane_64_2: |
| 330 | ; CHECK: st2 { v0.h, v1.h }[2], [x0] |
| 331 | entry: |
| 332 | tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a) |
| 333 | ret void |
| 334 | } |
| 335 | |
| 336 | ; Store one lane of 3 x v4f16 |
| 337 | define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 { |
| 338 | ; CHECK-LABEL: store_lane_64_3: |
| 339 | ; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0] |
| 340 | entry: |
| 341 | tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a) |
| 342 | ret void |
| 343 | } |
| 344 | |
| 345 | ; Store one lane of 4 x v4f16 |
| 346 | define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 { |
| 347 | ; CHECK-LABEL: store_lane_64_4: |
| 348 | ; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0] |
| 349 | entry: |
| 350 | tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a) |
| 351 | ret void |
| 352 | } |
| 353 | |
| 354 | ; Load one lane of 2 x v8f16 |
| 355 | define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 { |
| 356 | ; CHECK-LABEL: load_lane_128_2: |
| 357 | ; CHECK: ld2 { v0.h, v1.h }[2], [x0] |
| 358 | entry: |
| 359 | %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a) |
| 360 | ret { <8 x half>, <8 x half> } %0 |
| 361 | } |
| 362 | |
| 363 | ; Load one lane of 3 x v8f16 |
| 364 | define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 { |
| 365 | ; CHECK-LABEL: load_lane_128_3: |
| 366 | ; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0] |
| 367 | entry: |
| 368 | %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a) |
| 369 | ret { <8 x half>, <8 x half>, <8 x half> } %0 |
| 370 | } |
| 371 | |
| 372 | ; Load one lane of 4 x v8f16 |
| 373 | define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 { |
| 374 | ; CHECK-LABEL: load_lane_128_4: |
| 375 | ; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0] |
| 376 | entry: |
| 377 | %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a) |
| 378 | ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0 |
| 379 | } |
| 380 | |
| 381 | ; Store one lane of 2 x v8f16 |
| 382 | define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 { |
| 383 | ; CHECK-LABEL: store_lane_128_2: |
| 384 | ; CHECK: st2 { v0.h, v1.h }[2], [x0] |
| 385 | entry: |
| 386 | tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a) |
| 387 | ret void |
| 388 | } |
| 389 | |
| 390 | ; Store one lane of 3 x v8f16 |
| 391 | define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 { |
| 392 | ; CHECK-LABEL: store_lane_128_3: |
| 393 | ; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0] |
| 394 | entry: |
| 395 | tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a) |
| 396 | ret void |
| 397 | } |
| 398 | |
| 399 | ; Store one lane of 4 x v8f16 |
| 400 | define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 { |
| 401 | ; CHECK-LABEL: store_lane_128_4: |
| 402 | ; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0] |
| 403 | entry: |
| 404 | tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a) |
| 405 | ret void |
| 406 | } |
| 407 | |
| 408 | ; NEON intrinsics - load/store without interleaving |
| 409 | declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*) |
| 410 | declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*) |
| 411 | declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*) |
| 412 | declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*) |
| 413 | declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*) |
| 414 | declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*) |
| 415 | declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*) |
| 416 | declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*) |
| 417 | declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*) |
| 418 | declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*) |
| 419 | declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*) |
| 420 | declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*) |
| 421 | |
| 422 | ; Load 2 x v4f16 without de-interleaving |
| 423 | define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 { |
| 424 | ; CHECK-LABEL: load_64_2: |
| 425 | ; CHECK: ld1 { v0.4h, v1.4h }, [x0] |
| 426 | entry: |
| 427 | %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a) |
| 428 | ret { <4 x half>, <4 x half> } %0 |
| 429 | } |
| 430 | |
| 431 | ; Load 3 x v4f16 without de-interleaving |
| 432 | define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 { |
| 433 | ; CHECK-LABEL: load_64_3: |
| 434 | ; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0] |
| 435 | entry: |
| 436 | %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a) |
| 437 | ret { <4 x half>, <4 x half>, <4 x half> } %0 |
| 438 | } |
| 439 | |
| 440 | ; Load 4 x v4f16 without de-interleaving |
| 441 | define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 { |
| 442 | ; CHECK-LABEL: load_64_4: |
| 443 | ; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] |
| 444 | entry: |
| 445 | %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a) |
| 446 | ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0 |
| 447 | } |
| 448 | |
| 449 | ; Store 2 x v4f16 without interleaving |
| 450 | define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 { |
| 451 | ; CHECK-LABEL: store_64_2: |
| 452 | ; CHECK: st1 { v0.4h, v1.4h }, [x0] |
| 453 | entry: |
| 454 | tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a) |
| 455 | ret void |
| 456 | } |
| 457 | |
| 458 | ; Store 3 x v4f16 without interleaving |
| 459 | define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 { |
| 460 | ; CHECK-LABEL: store_64_3: |
| 461 | ; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0] |
| 462 | entry: |
| 463 | tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a) |
| 464 | ret void |
| 465 | } |
| 466 | |
| 467 | ; Store 4 x v4f16 without interleaving |
| 468 | define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 { |
| 469 | ; CHECK-LABEL: store_64_4: |
| 470 | ; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] |
| 471 | entry: |
| 472 | tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a) |
| 473 | ret void |
| 474 | } |
| 475 | |
| 476 | ; Load 2 x v8f16 without de-interleaving |
| 477 | define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 { |
| 478 | ; CHECK-LABEL: load_128_2: |
| 479 | ; CHECK: ld1 { v0.8h, v1.8h }, [x0] |
| 480 | entry: |
| 481 | %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a) |
| 482 | ret { <8 x half>, <8 x half> } %0 |
| 483 | } |
| 484 | |
| 485 | ; Load 3 x v8f16 without de-interleaving |
| 486 | define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 { |
| 487 | ; CHECK-LABEL: load_128_3: |
| 488 | ; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0] |
| 489 | entry: |
| 490 | %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a) |
| 491 | ret { <8 x half>, <8 x half>, <8 x half> } %0 |
| 492 | } |
| 493 | |
| 494 | ; Load 4 x v8f16 without de-interleaving |
| 495 | define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 { |
| 496 | ; CHECK-LABEL: load_128_4: |
| 497 | ; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] |
| 498 | entry: |
| 499 | %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a) |
| 500 | ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0 |
| 501 | } |
| 502 | |
| 503 | ; Store 2 x v8f16 without interleaving |
| 504 | define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 { |
| 505 | ; CHECK-LABEL: store_128_2: |
| 506 | ; CHECK: st1 { v0.8h, v1.8h }, [x0] |
| 507 | entry: |
| 508 | tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a) |
| 509 | ret void |
| 510 | } |
| 511 | |
| 512 | ; Store 3 x v8f16 without interleaving |
| 513 | define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 { |
| 514 | ; CHECK-LABEL: store_128_3: |
| 515 | ; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0] |
| 516 | entry: |
| 517 | tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a) |
| 518 | ret void |
| 519 | } |
| 520 | |
| 521 | ; Store 4 x v8f16 without interleaving |
| 522 | define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 { |
| 523 | ; CHECK-LABEL: store_128_4: |
| 524 | ; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] |
| 525 | entry: |
| 526 | tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a) |
| 527 | ret void |
| 528 | } |