TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* implement powerpc-64 for darwin

===-------------------------------------------------------------------------===

Use the stfiwx instruction for:

void foo(float a, int *b) { *b = a; }

===-------------------------------------------------------------------------===

unsigned short foo(float a) { return a; }
should be:
_foo:
        fctiwz f0,f1
        stfd f0,-8(r1)
        lhz r3,-2(r1)
        blr
not:
_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        rlwinm r3, r2, 0, 16, 31
        blr

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.
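
A minimal C sketch (the function and variable names are invented for
illustration) of the kind of loop where an update-form load applies: each
iteration loads through a pointer and then bumps it by a constant, which
something like lwzu could do in a single instruction.

int sum_strided(const int *p, int n) {
  int sum = 0;
  int i;
  for (i = 0; i < n; ++i) {
    sum += *p;     /* load ... */
    p += 4;        /* ... then a constant-stride pointer update */
  }
  return sum;
}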

===-------------------------------------------------------------------------===

Should hint to the branch select pass that it doesn't need to print the second
unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault

This occurs in SPASS.

===-------------------------------------------------------------------------===

* Codegen this:

  void test2(int X) {
     if (X == 0x12345678) bar();
  }

  as:

       xoris r0,r3,0x1234
       cmpwi cr0,r0,0x5678
       beq cr0,L6

  not:

        lis r2, 4660
        ori r2, r2, 22136
        cmpw cr0, r3, r2
        bne .LBB_test2_2

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
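
A hedged sketch of the transformation being described (the variable and struct
names here are invented):

/* Before: each small global needs its own PIC address computation. */
static int a, b, c;

/* After (conceptually): one aggregate whose base address is computed once and
   CSE'd; a, b, and c become constant offsets from that base. */
static struct { int a, b, c; } merged_globals;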

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when it has
more than one use.  Itanium will want this too.
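
For reference, a single Newton-Raphson refinement step for a reciprocal
estimate (such as the one fres produces) is sketched below; how many
iterations are needed for full accuracy is not worked out here.

/* One Newton-Raphson iteration for r ~= 1/d:  r1 = r0 * (2 - d * r0).
   Each iteration roughly doubles the number of correct bits in the
   starting estimate r0. */
double refine_recip(double d, double r0) {
  return r0 * (2.0 - d * r0);
}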

===-------------------------------------------------------------------------===

#define ARRAY_LENGTH 16

union bitfield {
    struct {
#ifndef __ppc__
        unsigned int field0 : 6;
        unsigned int field1 : 6;
        unsigned int field2 : 6;
        unsigned int field3 : 6;
        unsigned int field4 : 3;
        unsigned int field5 : 4;
        unsigned int field6 : 1;
#else
        unsigned int field6 : 1;
        unsigned int field5 : 4;
        unsigned int field4 : 3;
        unsigned int field3 : 6;
        unsigned int field2 : 6;
        unsigned int field1 : 6;
        unsigned int field0 : 6;
#endif
    } bitfields, bits;
    unsigned int u32All;
    signed int i32All;
    float f32All;
};


typedef struct program_t {
    union bitfield array[ARRAY_LENGTH];
    int size;
    int loaded;
} program;


void AdjustBitfields(program* prog, unsigned int fmt1)
{
    unsigned int shift = 0;
    unsigned int texCount = 0;
    unsigned int i;

    for (i = 0; i < 8; i++)
    {
        prog->array[i].bitfields.field0 = texCount;
        prog->array[i].bitfields.field1 = texCount + 1;
        prog->array[i].bitfields.field2 = texCount + 2;
        prog->array[i].bitfields.field3 = texCount + 3;

        texCount += (fmt1 >> shift) & 0x7;
        shift += 3;
    }
}

In the loop above, the bitfield adds get generated as
(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.

Since the input to the (or and, and) is an (add) rather than a (shl), the shift
doesn't get folded into the rlwimi instruction.  We should ideally see through
things like this, rather than forcing llvm to generate the equivalent

(shl (add bitfield, C2), C1) with some kind of mask.

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch-free code.  LLVM is turning it into < 1 because of the RHS.
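
One branch-free formulation the select could lower to (this assumes the usual
arithmetic right shift of a signed int, which PowerPC provides via srawi):

/* (a < 0) ? a : 0  is  a & (a >> 31): the shift yields all ones when a is
   negative and zero otherwise, so the AND selects a or 0 without a branch. */
int test3_branchfree(int a, int b) {
  return a & (a >> 31);
}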

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

Have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

     fp = &bar;
     for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof. on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        crnand cr0, cr0, cr7
        bne cr0, LBB_compare_4

instead of
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        mfcr r7, 1
        mcrf cr7, cr0
        mfcr r8, 1
        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31
        xori r7, r7, 1
        xori r8, r8, 1
        addi r2, r2, 1
        or r7, r8, r7
        cmpwi cr0, r7, 0
        bne cr0, LBB_compare_4  ; loopexit

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.
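
A small made-up example of the case being described: a struct containing
doubles passed by value, whose fields end up going through integer registers
under the current Darwin handling.

/* Hypothetical illustration: F is passed by value, so its double fields get
   shuffled through integer registers at the call rather than staying in FPRs. */
struct pair { double x, y; };
double sum_pair(struct pair F) { return F.x + F.y; }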

===-------------------------------------------------------------------------===

Generate lwbrx and other byteswapping load/store instructions when reasonable.
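
For instance (a hedged sketch, not checked against what we currently emit), a
manual little-endian 32-bit load like the one below is the kind of pattern
that could become a single lwbrx on big-endian PowerPC:

/* Assemble a 32-bit value from bytes stored little-endian; on big-endian
   PowerPC this is exactly a byte-reversed indexed load (lwbrx). */
unsigned int load_le32(const unsigned char *p) {
  return (unsigned int)p[0]         |
         ((unsigned int)p[1] << 8)  |
         ((unsigned int)p[2] << 16) |
         ((unsigned int)p[3] << 24);
}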

===-------------------------------------------------------------------------===

Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
TargetConstantVec's if it's one of the many forms that are algorithmically
computable using the spiffy altivec instructions.

===-------------------------------------------------------------------------===

Compile this:

double %test(double %X) {
        %Y = cast double %X to long
        %Z = cast long %Y to double
        ret double %Z
}

to this:

_test:
        fctidz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        lwz r3, -8(r1)
        stw r2, -12(r1)
        stw r3, -16(r1)
        lfd f0, -16(r1)
        fcfid f1, f0
        blr

without the lwz/stw's.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;  // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

The legalizer should lower this:

bool %test(ulong %x) {
  %tmp = setlt ulong %x, 4294967296
  ret bool %tmp
}

into "if x.high == 0", not:

_test:
        addi r2, r3, -1
        cntlzw r2, r2
        cntlzw r3, r3
        srwi r2, r2, 5
        srwi r4, r3, 5
        li r3, 0
        cmpwi cr0, r2, 0
        bne cr0, LBB1_2 ;
LBB1_1:
        or r3, r4, r4
LBB1_2:
        blr

noticed in 2005-05-11-Popcount-ffs-fls.c.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

So that

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

Becomes

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}
462