//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//

Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
registers, to generate better spill code.

//===----------------------------------------------------------------------===//

The first of these functions should compile to a single lvx from the constant
pool; the second should compile to an xor/stvx:

void foo(void) {
  int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
  bar (x);
}

#include <string.h>
void foo(void) {
  int x[8] __attribute__((aligned(128)));
  memset (x, 0, sizeof (x));
  bar (x);
}

//===----------------------------------------------------------------------===//

Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763

When -ffast-math is on, we can use 0.0.
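
A minimal sketch (not from the original notes; the function name vmul is made
up) of why -0.0 is the right addend when a plain vector multiply is emitted as
a multiply-add:

#include <altivec.h>

/* With a +0.0 addend, an element such as (-1.0f * 0.0f) would produce
   (-0.0) + (+0.0) == +0.0, but the exact product is -0.0.  The -0.0 addend
   preserves the sign of zero; under -ffast-math it does not matter.        */
vector float vmul(vector float a, vector float b) {
  vector float neg_zero = (vector float){ -0.0f, -0.0f, -0.0f, -0.0f };
  return vec_madd(a, b, neg_zero);
}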

//===----------------------------------------------------------------------===//

Consider this:
  v4f32 Vector;
  v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };

Since we know that "Vector" is 16-byte aligned and we know the element offset
of ".X", we should change the load into an lve*x instruction, instead of doing
a load/store/lve*x sequence.
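
A sketch of the desired form written with intrinsics (splat_x and the use of
vec_lde/vec_splat are illustrative, not the current codegen):

#include <altivec.h>

/* Because Vector is 16-byte aligned, lvewx (vec_lde) places element .X in
   lane 0 of the result (the other lanes are undefined), so a single splat
   of lane 0 yields { X, X, X, X } without spilling the whole vector.      */
vector float splat_x(vector float *Vector) {
  vector float t = vec_lde(0, (float *)Vector);
  return vec_splat(t, 0);
}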

//===----------------------------------------------------------------------===//

For functions that use Altivec AND have calls, we are VRSAVE'ing all
call-clobbered registers, which is overly conservative.

//===----------------------------------------------------------------------===//

Implement passing vectors by value into calls and receiving them as arguments.
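
For reference, the kind of source this item is about (callee/caller are
illustrative names, not existing tests):

#include <altivec.h>

vector float callee(vector float v);      /* vector received as an argument   */

vector float caller(vector float a, vector float b) {
  return callee(vec_add(a, b));           /* vector passed by value to a call */
}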

//===----------------------------------------------------------------------===//

GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
of C1/C2/C3, then a load and vperm of Variable.
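
An illustrative instance of that pattern (the constants and the variable name
are placeholders):

#include <altivec.h>

vector float build(float Variable) {
  /* { C1, C2, Variable, C3 }: three constants plus one run-time value */
  return (vector float){ 1.0f, 2.0f, Variable, 3.0f };
}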

//===----------------------------------------------------------------------===//

We need a way to teach tblgen that some operands of an intrinsic are required to
be constants. The verifier should enforce this constraint.

//===----------------------------------------------------------------------===//

We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
aligned stack slot, followed by a load/vperm. We should probably just store it
to a scalar stack slot, then use lvsl/vperm to load it. If the value is already
in memory, this is a big win.
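
A sketch of the lvsl/vperm idea in intrinsic form, for the case where the
scalar is already in memory (the function name is made up; lanes other than 0
end up holding whatever neighbouring bytes were loaded, which is fine since
SCALAR_TO_VECTOR leaves them undefined):

#include <altivec.h>

vector float scalar_to_vector(const float *p) {
  vector float block = vec_ld(0, p);          /* lvx of the enclosing 16 bytes */
  vector unsigned char sh = vec_lvsl(0, p);   /* lvsl: shift control for *p    */
  return vec_perm(block, block, sh);          /* rotate so *p lands in lane 0  */
}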

//===----------------------------------------------------------------------===//

extract_vector_elt of an arbitrary vector with a constant element index can be
done with the following instructions:

vTemp = vec_splat(v0,2);       // 2 is the element the src is in.
vec_ste(vTemp, 0, &destloc);   // vec_ste takes the vector first, then the address.

We can handle an arbitrary non-constant index by using lvsr/perm/ste.

//===----------------------------------------------------------------------===//

If we want to tie instruction selection into the scheduler, we can do some
constant formation with different instructions. For example, we can generate
"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
"vsplti 0" or "vxor", each of which uses a different execution unit and thus
could help scheduling.

This is probably only reasonable for a post-pass scheduler.

//===----------------------------------------------------------------------===//

For this function:

void test(vector float *A, vector float *B) {
  vector float C = (vector float)vec_cmpeq(*A, *B);
  if (!vec_any_eq(*A, *B))
    *B = (vector float){0,0,0,0};
  *A = C;
}

we get the following basic block:

        ...
        lvx v2, 0, r4
        lvx v3, 0, r3
        vcmpeqfp v4, v3, v2
        vcmpeqfp. v2, v3, v2
        bne cr6, LBB1_2 ; cond_next

The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
vcmpeqfp. result is used by a branch. This can be improved.

//===----------------------------------------------------------------------===//

The code generated for this is truly awful:

vector float test(float a, float b) {
  return (vector float){ 0.0, a, 0.0, 0.0};
}

LCPI1_0:        ; float
        .space 4
        .text
        .globl _test
        .align 4
_test:
        mfspr r2, 256
        oris r3, r2, 4096
        mtspr 256, r3
        lis r3, ha16(LCPI1_0)
        addi r4, r1, -32
        stfs f1, -16(r1)
        addi r5, r1, -16
        lfs f0, lo16(LCPI1_0)(r3)
        stfs f0, -32(r1)
        lvx v2, 0, r4
        lvx v3, 0, r5
        vmrghw v3, v3, v2
        vspltw v2, v2, 0
        vmrghw v2, v2, v3
        mtspr 256, r2
        blr

//===----------------------------------------------------------------------===//

int foo(vector float *x, vector float *y) {
  if (vec_all_eq(*x,*y)) return 3245;
  else return 12;
}

A predicate compare being used in a select_cc should have the same peephole
applied to it as a predicate compare used by a br_cc. There should be no
mfcr here:

_foo:
        mfspr r2, 256
        oris r5, r2, 12288
        mtspr 256, r5
        li r5, 12
        li r6, 3245
        lvx v2, 0, r4
        lvx v3, 0, r3
        vcmpeqfp. v2, v3, v2
        mfcr r3, 2
        rlwinm r3, r3, 25, 31, 31
        cmpwi cr0, r3, 0
        bne cr0, LBB1_2 ; entry
LBB1_1: ; entry
        mr r6, r5
LBB1_2: ; entry
        mr r3, r6
        mtspr 256, r2
        blr

//===----------------------------------------------------------------------===//

CodeGen/PowerPC/vec_constants.ll has an and operation that should be
codegen'd to andc. The issue is that the 'all ones' build vector is
SelectNodeTo'd into a VSPLTISB instruction node before the and/xor is
selected, which prevents the vnot pattern from matching.

//===----------------------------------------------------------------------===//

An alternative to the store/store/load approach for illegal insert element
lowering would be:

1. store element to any ol' slot
2. lvx the slot
3. lvsl 0; splat index; vcmpeq to generate a select mask
4. lvsl slot + x; vperm to rotate result into correct slot
5. vsel result together.

//===----------------------------------------------------------------------===//

Should codegen branches on vec_any/vec_all to avoid mfcr. Two examples:

#include <altivec.h>
int f(vector float a, vector float b)
{
  int aa = 0;
  if (vec_all_ge(a, b))
    aa |= 0x1;
  if (vec_any_ge(a,b))
    aa |= 0x2;
  return aa;
}

vector float f(vector float a, vector float b) {
  if (vec_any_eq(a, b))
    return a;
  else
    return b;
}