TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform (see the sketch below)
* implement powerpc-64 for darwin
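
A rough sketch of the bdnz transform (hypothetical loop; register choices made
up): a counted loop such as "for (i = 0; i < 100; ++i) body();" is typically
lowered with an explicit induction variable (addi/cmpw/bne), but could instead
put the trip count in CTR and use the fused decrement-and-branch instruction:

        li r2, 100
        mtctr r2
LBB_loop:
        ...                     ; loop body
        bdnz LBB_loop           ; decrement CTR, branch if CTR != 0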

===-------------------------------------------------------------------------===

Use the stfiwx instruction for:

void foo(float a, int *b) { *b = a; }
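
A sketch of the desired output (assuming the usual Darwin argument registers,
f1 = a, r4 = b); stfiwx stores the integer word from the FPR directly, with no
stack round-trip:

_foo:
        fctiwz f0, f1           ; convert to int with round-toward-zero
        stfiwx f0, 0, r4        ; store the low 32 bits of f0 to *b
        blr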

===-------------------------------------------------------------------------===

unsigned short foo(float a) { return a; }
should be:
_foo:
        fctiwz f0,f1
        stfd f0,-8(r1)
        lhz r3,-2(r1)
        blr
not:
_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        rlwinm r3, r2, 0, 16, 31
        blr

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.
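
A small sketch of the win (made-up registers): a pointer-bumping loop that
emits the pair

        lwz r2, 4(r3)
        addi r3, r3, 4

each iteration could instead use the update form, which folds the add into
the load:

        lwzu r2, 4(r3)          ; loads from r3+4 and writes r3+4 back to r3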

===-------------------------------------------------------------------------===

Should hint to the branch select pass that it doesn't need to print the second
unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault

This occurs in SPASS.

===-------------------------------------------------------------------------===

* Codegen this:

   void test2(int X) {
     if (X == 0x12345678) bar();
   }

    as:

        xoris r0,r3,0x1234
        cmpwi cr0,r0,0x5678
        beq cr0,L6

    not:

        lis r2, 4660
        ori r2, r2, 22136
        cmpw cr0, r3, r2
        bne .LBB_test2_2

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.
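
A sketch of the idea (the .CPI_X label and the 8-byte offsets here are
illustrative, not what the compiler emits today):

_X:
        lis r2, ha16(.CPI_X)
        la r2, lo16(.CPI_X)(r2)
        lfd f0, 0(r2)
        lfd f2, 8(r2)
        fmadd f0, f1, f0, f2
        lfd f1, 16(r2)
        lfd f2, 24(r2)
        fmadd f1, f0, f1, f2
        blr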

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).
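
Roughly (names invented for illustration):

/* before: each global needs its own PIC/GOT address computation */
static int a, b, c;

/* after (conceptually): one base address, CSE'd, plus small constant offsets */
static struct { int a, b, c; } merged;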

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when the
reciprocal has more than one use.  Itanium will want this too.
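
A minimal sketch of the refinement step (plain C, not tied to any particular
estimate instruction): starting from an estimate x0 of 1/d, each Newton-Raphson
iteration roughly doubles the number of correct bits:

float refine_recip(float d, float x0) {
  /* x1 = x0 * (2 - d*x0); apply once or twice after the hardware estimate */
  return x0 * (2.0f - d * x0);
}

a/b is then a * recip(b), which pays off when the same reciprocal feeds
several divides.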

===-------------------------------------------------------------------------===

#define ARRAY_LENGTH 16

union bitfield {
    struct {
#ifndef __ppc__
        unsigned int field0 : 6;
        unsigned int field1 : 6;
        unsigned int field2 : 6;
        unsigned int field3 : 6;
        unsigned int field4 : 3;
        unsigned int field5 : 4;
        unsigned int field6 : 1;
#else
        unsigned int field6 : 1;
        unsigned int field5 : 4;
        unsigned int field4 : 3;
        unsigned int field3 : 6;
        unsigned int field2 : 6;
        unsigned int field1 : 6;
        unsigned int field0 : 6;
#endif
    } bitfields, bits;
    unsigned int u32All;
    signed int i32All;
    float f32All;
};


typedef struct program_t {
    union bitfield array[ARRAY_LENGTH];
    int size;
    int loaded;
} program;


void AdjustBitfields(program* prog, unsigned int fmt1)
{
    unsigned int shift = 0;
    unsigned int texCount = 0;
    unsigned int i;

    for (i = 0; i < 8; i++)
    {
        prog->array[i].bitfields.field0 = texCount;
        prog->array[i].bitfields.field1 = texCount + 1;
        prog->array[i].bitfields.field2 = texCount + 2;
        prog->array[i].bitfields.field3 = texCount + 3;

        texCount += (fmt1 >> shift) & 0x7;
        shift += 3;
    }
}

In the loop above, the bitfield adds get generated as
(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.

Since the input to the (or and, and) is an (add) rather than a (shl), the shift
doesn't get folded into the rlwimi instruction.  We should ideally see through
things like this, rather than forcing llvm to generate the equivalent

(shl (add bitfield, C2), C1) with some kind of mask.

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.

===-------------------------------------------------------------------------===

176.gcc contains a bunch of code like this (this occurs dozens of times):

int %test(uint %mode.0.i.0) {
        %tmp.79 = cast uint %mode.0.i.0 to sbyte        ; <sbyte> [#uses=1]
        %tmp.80 = cast sbyte %tmp.79 to int             ; <int> [#uses=1]
        %tmp.81 = shl int %tmp.80, ubyte 16             ; <int> [#uses=1]
        %tmp.82 = and int %tmp.81, 16711680
        ret int %tmp.82
}

which we compile to:

_test:
        extsb r2, r3
        rlwinm r3, r2, 16, 8, 15
        blr

The extsb is obviously dead.  This can be handled by a future thing like
MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
the sign bits are never used, so we can fold the sext_inreg to nothing).

I'm seeing code like this:

        srwi r3, r3, 16
        extsb r3, r3
        rlwimi r4, r3, 16, 8, 15

in which the extsb is preventing the srwi from being nuked.

===-------------------------------------------------------------------------===

Another example that occurs is:

uint %test(int %specbits.6.1) {
        %tmp.2540 = shr int %specbits.6.1, ubyte 11     ; <int> [#uses=1]
        %tmp.2541 = cast int %tmp.2540 to uint          ; <uint> [#uses=1]
        %tmp.2542 = shl uint %tmp.2541, ubyte 13        ; <uint> [#uses=1]
        %tmp.2543 = and uint %tmp.2542, 8192            ; <uint> [#uses=1]
        ret uint %tmp.2543
}

which we codegen as:

l1_test:
        srawi r2, r3, 11
        rlwinm r3, r2, 13, 18, 18
        blr

the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
dead), which I think can then be folded into the rlwinm.

===-------------------------------------------------------------------------===
253
Chris Lattner62c08dd2005-12-08 07:13:28 +0000254Compile offsets from allocas:
255
256int *%test() {
257 %X = alloca { int, int }
258 %Y = getelementptr {int,int}* %X, int 0, uint 1
259 ret int* %Y
260}
261
262into a single add, not two:
263
264_test:
265 addi r2, r1, -8
266 addi r3, r2, 4
267 blr
268
269--> important for C++.
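
Presumably something like:

_test:
        addi r3, r1, -4
        blr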

===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch free code.  LLVM is turning it into < 1 because of the RHS.
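
A branch-free sketch (assuming a arrives in r3): (a < 0) ? a : 0 is just
a & (a >> 31), since the arithmetic shift yields all ones when a is negative
and zero otherwise:

_test3:
        srawi r2, r3, 31        ; r2 = (a < 0) ? -1 : 0
        and r3, r3, r2
        blr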

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

Have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        crnand cr0, cr0, cr7
        bne cr0, LBB_compare_4

instead of
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        mfcr r7, 1
        mcrf cr7, cr0
        mfcr r8, 1
        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31
        xori r7, r7, 1
        xori r8, r8, 1
        addi r2, r2, 1
        or r7, r8, r7
        cmpwi cr0, r7, 0
        bne cr0, LBB_compare_4  ; loopexit

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Generate lwbrx and other byteswapping load/store instructions when reasonable.
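
For example (a generic byte-swapping load, not taken from this file):

unsigned int load_le32(unsigned int *p) {
  unsigned int x = *p;
  return (x >> 24) | ((x >> 8) & 0xff00) |
         ((x << 8) & 0xff0000) | (x << 24);
}

could plausibly compile to:

_load_le32:
        lwbrx r3, 0, r3         ; byte-reversed load
        blr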
368
Chris Lattner96909792006-01-28 05:40:47 +0000369===-------------------------------------------------------------------------===
370
371Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
372TargetConstantVec's if it's one of the many forms that are algorithmically
373computable using the spiffy altivec instructions.
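
For instance (illustrative only), a v4i32 constant of all 1's or all 0's never
needs a constant pool load:

        vspltisw v2, 1          ; v2 = <1, 1, 1, 1>
        vxor v3, v3, v3         ; v3 = <0, 0, 0, 0>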

===-------------------------------------------------------------------------===

Compile this:

double %test(double %X) {
        %Y = cast double %X to long
        %Z = cast long %Y to double
        ret double %Z
}

to this:

_test:
        fctidz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        lwz r3, -8(r1)
        stw r2, -12(r1)
        stw r3, -16(r1)
        lfd f0, -16(r1)
        fcfid f1, f0
        blr

without the lwz/stw's.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

The legalizer should lower this:

bool %test(ulong %x) {
  %tmp = setlt ulong %x, 4294967296
  ret bool %tmp
}

into "if x.high == 0", not:

_test:
        addi r2, r3, -1
        cntlzw r2, r2
        cntlzw r3, r3
        srwi r2, r2, 5
        srwi r4, r3, 5
        li r3, 0
        cmpwi cr0, r2, 0
        bne cr0, LBB1_2 ;
LBB1_1:
        or r3, r4, r4
LBB1_2:
        blr

noticed in 2005-05-11-Popcount-ffs-fls.c.
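
A sketch of the branch-free form (assuming the high word of %x arrives in r3):
x < 2^32 iff the high word is zero, and "high == 0" is cntlzw(high) >> 5:

_test:
        cntlzw r2, r3
        srwi r3, r2, 5
        blr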

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

So that

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

Becomes

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr