blob: 42d701ef01ba9210bf053dcbe6032eef90d30df1 [file] [log] [blame]
TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* implement powerpc-64 for darwin
Nate Begeman50fb3c42005-12-24 01:00:15 +00005
Nate Begemana63fee82006-02-03 05:17:06 +00006===-------------------------------------------------------------------------===
Nate Begeman50fb3c42005-12-24 01:00:15 +00007
Nate Begemana63fee82006-02-03 05:17:06 +00008Use the stfiwx instruction for:
Chris Lattnerb65975a2005-07-26 19:07:51 +00009
Nate Begemana63fee82006-02-03 05:17:06 +000010void foo(float a, int *b) { *b = a; }
11
12===-------------------------------------------------------------------------===
13
Nate Begeman5a014812005-08-14 01:17:16 +000014unsigned short foo(float a) { return a; }
Nate Begemana63fee82006-02-03 05:17:06 +000015should be:
Nate Begeman5a014812005-08-14 01:17:16 +000016_foo:
17 fctiwz f0,f1
18 stfd f0,-8(r1)
19 lhz r3,-2(r1)
20 blr
21not:
22_foo:
23 fctiwz f0, f1
24 stfd f0, -8(r1)
25 lwz r2, -4(r1)
26 rlwinm r3, r2, 0, 16, 31
27 blr
28
Nate Begemana63fee82006-02-03 05:17:06 +000029===-------------------------------------------------------------------------===
Chris Lattner6281ae42005-08-05 19:18:32 +000030
Nate Begemana63fee82006-02-03 05:17:06 +000031Support 'update' load/store instructions. These are cracked on the G5, but are
32still a codesize win.
33
34===-------------------------------------------------------------------------===
35
36Should hint to the branch select pass that it doesn't need to print the second
37unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
Chris Lattner424dcbd2005-08-23 06:27:59 +000041
Chris Lattnera3c44542005-08-24 18:15:24 +000042===-------------------------------------------------------------------------===
43
Chris Lattner424dcbd2005-08-23 06:27:59 +000044* Codegen this:
45
46 void test2(int X) {
47 if (X == 0x12345678) bar();
48 }
49
50 as:
51
52 xoris r0,r3,0x1234
53 cmpwi cr0,r0,0x5678
54 beq cr0,L6
55
56 not:
57
58 lis r2, 4660
59 ori r2, r2, 22136
60 cmpw cr0, r3, r2
61 bne .LBB_test2_2
62
Chris Lattnera3c44542005-08-24 18:15:24 +000063===-------------------------------------------------------------------------===
64
65Lump the constant pool for each function into ONE pic object, and reference
66pieces of it as offsets from the start. For functions like this (contrived
67to have lots of constants obviously):
68
69double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
70
71We generate:
72
73_X:
74 lis r2, ha16(.CPI_X_0)
75 lfd f0, lo16(.CPI_X_0)(r2)
76 lis r2, ha16(.CPI_X_1)
77 lfd f2, lo16(.CPI_X_1)(r2)
78 fmadd f0, f1, f0, f2
79 lis r2, ha16(.CPI_X_2)
80 lfd f1, lo16(.CPI_X_2)(r2)
81 lis r2, ha16(.CPI_X_3)
82 lfd f2, lo16(.CPI_X_3)(r2)
83 fmadd f1, f0, f1, f2
84 blr
85
86It would be better to materialize .CPI_X into a register, then use immediates
87off of the register to avoid the lis's. This is even more important in PIC
88mode.
89
Chris Lattner39b248b2006-02-02 23:50:22 +000090Note that this (and the static variable version) is discussed here for GCC:
91http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
92
Chris Lattnera3c44542005-08-24 18:15:24 +000093===-------------------------------------------------------------------------===
Nate Begeman92cce902005-09-06 15:30:48 +000094
Chris Lattner33c1dab2006-02-03 06:22:11 +000095PIC Code Gen IPO optimization:
96
97Squish small scalar globals together into a single global struct, allowing the
98address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
99of the GOT on targets with one).
100
101Note that this is discussed here for GCC:
102http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
103
104===-------------------------------------------------------------------------===
105
Implement Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implementing divide as multiply by reciprocal when it has
more than one use.  Itanium will want this too.
Nate Begeman21e463b2005-10-16 05:39:50 +0000109
110===-------------------------------------------------------------------------===
111
Nate Begeman5cd61ce2005-10-25 23:50:02 +0000112#define ARRAY_LENGTH 16
113
114union bitfield {
115 struct {
116#ifndef __ppc__
117 unsigned int field0 : 6;
118 unsigned int field1 : 6;
119 unsigned int field2 : 6;
120 unsigned int field3 : 6;
121 unsigned int field4 : 3;
122 unsigned int field5 : 4;
123 unsigned int field6 : 1;
124#else
125 unsigned int field6 : 1;
126 unsigned int field5 : 4;
127 unsigned int field4 : 3;
128 unsigned int field3 : 6;
129 unsigned int field2 : 6;
130 unsigned int field1 : 6;
131 unsigned int field0 : 6;
132#endif
133 } bitfields, bits;
134 unsigned int u32All;
135 signed int i32All;
136 float f32All;
137};
138
139
140typedef struct program_t {
141 union bitfield array[ARRAY_LENGTH];
142 int size;
143 int loaded;
144} program;
145
146
147void AdjustBitfields(program* prog, unsigned int fmt1)
148{
149 unsigned int shift = 0;
150 unsigned int texCount = 0;
151 unsigned int i;
152
153 for (i = 0; i < 8; i++)
154 {
155 prog->array[i].bitfields.field0 = texCount;
156 prog->array[i].bitfields.field1 = texCount + 1;
157 prog->array[i].bitfields.field2 = texCount + 2;
158 prog->array[i].bitfields.field3 = texCount + 3;
159
160 texCount += (fmt1 >> shift) & 0x7;
161 shift += 3;
162 }
163}
164
165In the loop above, the bitfield adds get generated as
166(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
167
168Since the input to the (or and, and) is an (add) rather than a (shl), the shift
169doesn't get folded into the rlwimi instruction. We should ideally see through
170things like this, rather than forcing llvm to generate the equivalent
171
172(shl (add bitfield, C2), C1) with some kind of mask.
Chris Lattner01959102005-10-28 00:20:45 +0000173
174===-------------------------------------------------------------------------===
175
Chris Lattnerae4664a2005-11-05 08:57:56 +0000176Compile this:
177
178int %f1(int %a, int %b) {
179 %tmp.1 = and int %a, 15 ; <int> [#uses=1]
180 %tmp.3 = and int %b, 240 ; <int> [#uses=1]
181 %tmp.4 = or int %tmp.3, %tmp.1 ; <int> [#uses=1]
182 ret int %tmp.4
183}
184
185without a copy. We make this currently:
186
187_f1:
188 rlwinm r2, r4, 0, 24, 27
189 rlwimi r2, r3, 0, 28, 31
190 or r3, r2, r2
191 blr
192
193The two-addr pass or RA needs to learn when it is profitable to commute an
194instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
195currently only commutes to avoid inserting a copy BEFORE the two addr instr.
196
Chris Lattner62c08dd2005-12-08 07:13:28 +0000197===-------------------------------------------------------------------------===
198
Nate Begemaneb20ed62006-01-28 01:22:10 +0000199176.gcc contains a bunch of code like this (this occurs dozens of times):
200
201int %test(uint %mode.0.i.0) {
202 %tmp.79 = cast uint %mode.0.i.0 to sbyte ; <sbyte> [#uses=1]
203 %tmp.80 = cast sbyte %tmp.79 to int ; <int> [#uses=1]
204 %tmp.81 = shl int %tmp.80, ubyte 16 ; <int> [#uses=1]
205 %tmp.82 = and int %tmp.81, 16711680
206 ret int %tmp.82
207}
208
209which we compile to:
210
211_test:
212 extsb r2, r3
213 rlwinm r3, r2, 16, 8, 15
214 blr
215
216The extsb is obviously dead. This can be handled by a future thing like
217MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
218the sign bits are never used, so we can fold the sext_inreg to nothing).
219
220I'm seeing code like this:
221
222 srwi r3, r3, 16
223 extsb r3, r3
224 rlwimi r4, r3, 16, 8, 15
225
226in which the extsb is preventing the srwi from being nuked.
227
228===-------------------------------------------------------------------------===
229
230Another example that occurs is:
231
232uint %test(int %specbits.6.1) {
233 %tmp.2540 = shr int %specbits.6.1, ubyte 11 ; <int> [#uses=1]
234 %tmp.2541 = cast int %tmp.2540 to uint ; <uint> [#uses=1]
235 %tmp.2542 = shl uint %tmp.2541, ubyte 13 ; <uint> [#uses=1]
236 %tmp.2543 = and uint %tmp.2542, 8192 ; <uint> [#uses=1]
237 ret uint %tmp.2543
238}
239
240which we codegen as:
241
_test:
        srawi r2, r3, 11
        rlwinm r3, r2, 13, 18, 18
        blr
246
247the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
248dead), which I think can then be folded into the rlwinm.
249
250===-------------------------------------------------------------------------===
251
Chris Lattner62c08dd2005-12-08 07:13:28 +0000252Compile offsets from allocas:
253
254int *%test() {
255 %X = alloca { int, int }
256 %Y = getelementptr {int,int}* %X, int 0, uint 1
257 ret int* %Y
258}
259
260into a single add, not two:
261
262_test:
263 addi r2, r1, -8
264 addi r3, r2, 4
265 blr
266
267--> important for C++.
268
Chris Lattner39706e62005-12-22 17:19:28 +0000269===-------------------------------------------------------------------------===
270
271int test3(int a, int b) { return (a < 0) ? a : 0; }
272
273should be branch free code. LLVM is turning it into < 1 because of the RHS.
274
275===-------------------------------------------------------------------------===
276
Chris Lattner39706e62005-12-22 17:19:28 +0000277No loads or stores of the constants should be needed:
278
279struct foo { double X, Y; };
280void xxx(struct foo F);
281void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
282
Chris Lattner1db4b4f2006-01-16 17:53:00 +0000283===-------------------------------------------------------------------------===
284
Chris Lattner98fbc2f2006-01-16 17:58:54 +0000285Darwin Stub LICM optimization:
286
287Loops like this:
288
289 for (...) bar();
290
291Have to go through an indirect stub if bar is external or linkonce. It would
292be better to compile it as:
293
294 fp = &bar;
295 for (...) fp();
296
297which only computes the address of bar once (instead of each time through the
298stub). This is Darwin specific and would have to be done in the code generator.
299Probably not a win on x86.
300
301===-------------------------------------------------------------------------===
302
303PowerPC i1/setcc stuff (depends on subreg stuff):
304
305Check out the PPC code we get for 'compare' in this testcase:
306http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672
307
308oof. on top of not doing the logical crnand instead of (mfcr, mfcr,
309invert, invert, or), we then have to compare it against zero instead of
310using the value already in a CR!
311
312that should be something like
313 cmpw cr7, r8, r5
314 cmpw cr0, r7, r3
315 crnand cr0, cr0, cr7
316 bne cr0, LBB_compare_4
317
318instead of
319 cmpw cr7, r8, r5
320 cmpw cr0, r7, r3
321 mfcr r7, 1
322 mcrf cr7, cr0
323 mfcr r8, 1
324 rlwinm r7, r7, 30, 31, 31
325 rlwinm r8, r8, 30, 31, 31
326 xori r7, r7, 1
327 xori r8, r8, 1
328 addi r2, r2, 1
329 or r7, r8, r7
330 cmpwi cr0, r7, 0
331 bne cr0, LBB_compare_4 ; loopexit
332
333===-------------------------------------------------------------------------===
334
335Simple IPO for argument passing, change:
336 void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
337
338the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
339of arguments get assigned to r3 through r10. That is, if you have a function
340foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
341argument bytes for r4 and r5. The trick then would be to shuffle the argument
342order for functions we can internalize so that the maximum number of
343integers/pointers get passed in regs before you see any of the fp arguments.
344
345Instead of implementing this, it would actually probably be easier to just
346implement a PPC fastcc, where we could do whatever we wanted to the CC,
347including having this work sanely.
348
349===-------------------------------------------------------------------------===
350
351Fix Darwin FP-In-Integer Registers ABI
352
353Darwin passes doubles in structures in integer registers, which is very very
354bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
355that percolates these things out of functions.
356
357Check out how horrible this is:
358http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
359
360This is an extension of "interprocedural CC unmunging" that can't be done with
361just fastcc.
362
363===-------------------------------------------------------------------------===
364
Chris Lattner3cda14f2006-01-19 02:09:38 +0000365Generate lwbrx and other byteswapping load/store instructions when reasonable.
366
Chris Lattner96909792006-01-28 05:40:47 +0000367===-------------------------------------------------------------------------===
368
369Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
370TargetConstantVec's if it's one of the many forms that are algorithmically
371computable using the spiffy altivec instructions.
372
Chris Lattner56b69642006-01-31 02:55:28 +0000373===-------------------------------------------------------------------------===
374
375Compile this:
376
377double %test(double %X) {
378 %Y = cast double %X to long
379 %Z = cast long %Y to double
380 ret double %Z
381}
382
383to this:
384
385_test:
386 fctidz f0, f1
387 stfd f0, -8(r1)
388 lwz r2, -4(r1)
389 lwz r3, -8(r1)
390 stw r2, -12(r1)
391 stw r3, -16(r1)
392 lfd f0, -16(r1)
393 fcfid f1, f0
394 blr
395
396without the lwz/stw's.
397
Chris Lattner83e64ba2006-01-31 07:16:34 +0000398===-------------------------------------------------------------------------===
399
400Compile this:
401
402int foo(int a) {
403 int b = (a < 8);
404 if (b) {
405 return b * 3; // ignore the fact that this is always 3.
406 } else {
407 return 2;
408 }
409}
410
411into something not this:
412
413_foo:
4141) cmpwi cr7, r3, 8
415 mfcr r2, 1
416 rlwinm r2, r2, 29, 31, 31
4171) cmpwi cr0, r3, 7
418 bgt cr0, LBB1_2 ; UnifiedReturnBlock
419LBB1_1: ; then
420 rlwinm r2, r2, 0, 31, 31
421 mulli r3, r2, 3
422 blr
423LBB1_2: ; UnifiedReturnBlock
424 li r3, 2
425 blr
426
427In particular, the two compares (marked 1) could be shared by reversing one.
428This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
429same operands (but backwards) exists. In this case, this wouldn't save us
430anything though, because the compares still wouldn't be shared.
Chris Lattner0ddc1802006-02-01 00:28:12 +0000431
Chris Lattner5a7efc92006-02-01 17:54:23 +0000432===-------------------------------------------------------------------------===
433
434The legalizer should lower this:
435
436bool %test(ulong %x) {
437 %tmp = setlt ulong %x, 4294967296
438 ret bool %tmp
439}
440
441into "if x.high == 0", not:
442
443_test:
444 addi r2, r3, -1
445 cntlzw r2, r2
446 cntlzw r3, r3
447 srwi r2, r2, 5
Nate Begeman93c740b2006-02-02 07:27:56 +0000448 srwi r4, r3, 5
449 li r3, 0
Chris Lattner5a7efc92006-02-01 17:54:23 +0000450 cmpwi cr0, r2, 0
451 bne cr0, LBB1_2 ;
452LBB1_1:
Nate Begeman93c740b2006-02-02 07:27:56 +0000453 or r3, r4, r4
Chris Lattner5a7efc92006-02-01 17:54:23 +0000454LBB1_2:
Chris Lattner5a7efc92006-02-01 17:54:23 +0000455 blr
456
457noticed in 2005-05-11-Popcount-ffs-fls.c.
Chris Lattner275b8842006-02-02 07:37:11 +0000458
459
460===-------------------------------------------------------------------------===
461
462We should custom expand setcc instead of pretending that we have it. That
463would allow us to expose the access of the crbit after the mfcr, allowing
464that access to be trivially folded into other ops. A simple example:
465
466int foo(int a, int b) { return (a < b) << 4; }
467
468compiles into:
469
470_foo:
471 cmpw cr7, r3, r4
472 mfcr r2, 1
473 rlwinm r2, r2, 29, 31, 31
474 slwi r3, r2, 4
475 blr
476
Chris Lattnerd463f7f2006-02-03 01:49:49 +0000477===-------------------------------------------------------------------------===
478
Nate Begemana63fee82006-02-03 05:17:06 +0000479Fold add and sub with constant into non-extern, non-weak addresses so this:
480
481static int a;
482void bar(int b) { a = b; }
483void foo(unsigned char *c) {
484 *c = a;
485}
486
487So that
488
489_foo:
490 lis r2, ha16(_a)
491 la r2, lo16(_a)(r2)
492 lbz r2, 3(r2)
493 stb r2, 0(r3)
494 blr
495
496Becomes
497
498_foo:
499 lis r2, ha16(_a+3)
500 lbz r2, lo16(_a+3)(r2)
501 stb r2, 0(r3)
502 blr